diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H
index 64b6bec6729..ae649093636 100644
--- a/Src/Base/AMReX_GpuAtomic.H
+++ b/Src/Base/AMReX_GpuAtomic.H
@@ -18,8 +18,7 @@ namespace Gpu::Atomic {
 // For LogicalOr and LogicalAnd, the data type is int.
 // For Exch and CAS, the data type is generic.
 // All these functions are non-atomic in host code!!!
-// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
-// HostDevice::Atomic::Add is supported. We could certainly add more.
+// If one needs them to be atomic in host code, use HostDevice::Atomic::*.
 // If we add more types for atomicAdd, we also need to update HasAtomicAdd in AMReX_TypeTraits.H.

 /// \cond DOXYGEN_IGNORE
@@ -617,6 +616,21 @@ namespace HostDevice::Atomic {
         *sum += value;
     }

+    template <class T>
+    AMREX_FORCE_INLINE
+    T FetchAdd_Host (T* const sum, T const value) noexcept
+    {
+        T old;
+#ifdef AMREX_USE_OMP
+#pragma omp atomic capture
+#endif
+        {
+            old = *sum;
+            *sum += value;
+        }
+        return old;
+    }
+
     template <class T>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Add (T* const sum, T const value) noexcept
@@ -625,6 +639,14 @@ namespace HostDevice::Atomic {
         AMREX_IF_ON_HOST((Add_Host(sum,value);))
     }

+    template <class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T FetchAdd (T* const sum, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((return Gpu::Atomic::Add(sum,value);))
+        AMREX_IF_ON_HOST((return FetchAdd_Host(sum,value);))
+    }
+
 }

 #ifdef AMREX_USE_GPU
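Editorial note on the hunk above: HostDevice::Atomic::FetchAdd returns the value *sum held before the addition. Gpu::Atomic::Add already has fetch-add semantics on the device; the new host path gets the same behavior from "omp atomic capture" (and is non-atomic without OpenMP). A minimal usage sketch, not part of the patch, with a made-up counter:

    // Each caller claims a unique, consecutive slot in 'counter'.
    unsigned int counter = 0; // hypothetical shared counter
    unsigned int slot = amrex::HostDevice::Atomic::FetchAdd(&counter, 1u);
    // 'slot' is the pre-add value: 0 for the first caller, 1 for the second, ...
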
diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H
index 8e13f7a44d3..8d348c4f795 100644
--- a/Src/Particle/AMReX_NeighborParticles.H
+++ b/Src/Particle/AMReX_NeighborParticles.H
@@ -16,6 +16,7 @@ namespace amrex {
 struct NeighborCode
 {
     int grid_id;
+    Box grid_box;
     IntVect periodic_shift;
 };

diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
index 45b3b4ba228..d165ecc01e7 100644
--- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
+++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
@@ -99,6 +99,7 @@ buildNeighborMask ()
                 {
                     NeighborCode code;
                     code.grid_id = nbor_grid.grid_id;
+                    code.grid_box = ba[nbor_grid.grid_id];
                     code.periodic_shift = nbor_grid.periodic_shift;
                     h_code_arr.push_back(code);
                     h_isec_boxes.push_back(nbor_grid.box);
@@ -170,6 +171,8 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
         auto p_code_array = m_code_array[gid].dataPtr();
         auto p_isec_boxes = m_isec_boxes[gid].dataPtr();
         const int nisec_box = m_isec_boxes[gid].size();
+        const bool do_tiling = this->do_tiling;
+        const IntVect tile_size = this->tile_size;
         // auto p_code_offsets = m_code_offsets[gid].dataPtr();

         AMREX_FOR_1D ( np, i,
@@ -194,12 +197,14 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
         Gpu::dtoh_memcpy_async(&num_copies, offsets.data()+np, sizeof(int));
         Gpu::streamSynchronize();

-        neighbor_copy_op.resize(gid, lev, num_copies);
+        neighbor_copy_op.resize(gid, tid, lev, num_copies);

-        auto p_boxes = neighbor_copy_op.m_boxes[lev][gid].dataPtr();
-        auto p_levs = neighbor_copy_op.m_levels[lev][gid].dataPtr();
-        auto p_src_indices = neighbor_copy_op.m_src_indices[lev][gid].dataPtr();
-        auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][gid].dataPtr();
+        auto tile_index = std::make_pair(gid, tid);
+        auto p_boxes = neighbor_copy_op.m_boxes[lev][tile_index].dataPtr();
+        auto p_levs = neighbor_copy_op.m_levels[lev][tile_index].dataPtr();
+        auto p_tiles = neighbor_copy_op.m_tiles[lev][tile_index].dataPtr();
+        auto p_src_indices = neighbor_copy_op.m_src_indices[lev][tile_index].dataPtr();
+        auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][tile_index].dataPtr();
         Gpu::streamSynchronize();

         AMREX_FOR_1D ( np, i,
@@ -213,6 +218,9 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
             for (int j=0; j<nisec_box; ++j)
...
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(this->do_tiling == 0,
+        "Tiling on the GPU is not supported for neighbor particles.");
+
     buildNeighborMask();
     this->defineBufferMap();

diff --git a/Src/Particle/AMReX_ParticleBufferMap.H b/Src/Particle/AMReX_ParticleBufferMap.H
index 33cec4b8236..656fef35dc9 100644
--- a/Src/Particle/AMReX_ParticleBufferMap.H
+++ b/Src/Particle/AMReX_ParticleBufferMap.H
@@ -13,39 +13,45 @@ namespace amrex {

 struct GetPID
 {
     const int* m_bucket_to_pid;
-    const int* m_lev_gid_to_bucket;
+    const int* m_lev_gid_tid_to_bucket;
     const int* m_lev_offsets;
+    const int* m_gid_offsets;

     GetPID (const Gpu::DeviceVector<int>& bucket_to_pid,
-            const Gpu::DeviceVector<int>& lev_gid_to_bucket,
-            const Gpu::DeviceVector<int>& lev_offsets)
+            const Gpu::DeviceVector<int>& lev_gid_tid_to_bucket,
+            const Gpu::DeviceVector<int>& lev_offsets,
+            const Gpu::DeviceVector<int>& gid_offsets)
         : m_bucket_to_pid(bucket_to_pid.dataPtr()),
-          m_lev_gid_to_bucket(lev_gid_to_bucket.dataPtr()),
-          m_lev_offsets(lev_offsets.dataPtr())
+          m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket.dataPtr()),
+          m_lev_offsets(lev_offsets.dataPtr()),
+          m_gid_offsets(gid_offsets.dataPtr())
     {}

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const int lev, const int gid) const noexcept
+    int operator() (const int lev, const int gid, const int tid) const noexcept
     {
-        return m_bucket_to_pid[m_lev_gid_to_bucket[m_lev_offsets[lev]+gid]];
+        return m_bucket_to_pid[m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid]];
     }
 };

 struct GetBucket
 {
-    const int* m_lev_gid_to_bucket;
+    const int* m_lev_gid_tid_to_bucket;
     const int* m_lev_offsets;
-
-    GetBucket (const int* lev_gid_to_bucket_ptr,
-               const int* lev_offsets_ptr)
-        : m_lev_gid_to_bucket(lev_gid_to_bucket_ptr),
-          m_lev_offsets(lev_offsets_ptr)
+    const int* m_gid_offsets;
+
+    GetBucket (const int* lev_gid_tid_to_bucket_ptr,
+               const int* lev_offsets_ptr,
+               const int* gid_offsets_ptr)
+        : m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket_ptr),
+          m_lev_offsets(lev_offsets_ptr),
+          m_gid_offsets(gid_offsets_ptr)
     {}

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const int lev, const int gid) const noexcept
+    int operator() (const int lev, const int gid, const int tid) const noexcept
     {
-        return m_lev_gid_to_bucket[m_lev_offsets[lev]+gid];
+        return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid];
     }
 };

@@ -56,28 +62,40 @@ class ParticleBufferMap
     Vector<DistributionMapping> m_dm;

     Vector<int> m_bucket_to_gid;
+    Vector<int> m_bucket_to_tid;
     Vector<int> m_bucket_to_lev;
     Vector<int> m_bucket_to_pid;

-    Vector<int> m_lev_gid_to_bucket;
+    Vector<int> m_lev_gid_tid_to_bucket;
     Vector<int> m_lev_offsets;
+    Vector<int> m_gid_offsets;

     Vector<int> m_proc_box_counts;
     Vector<int> m_proc_box_offsets;

+    bool m_do_tiling{false};
+    IntVect m_tile_size{AMREX_D_DECL(1024000, 1024000, 1024000)};
+
     Gpu::DeviceVector<int> d_bucket_to_pid;
-    Gpu::DeviceVector<int> d_lev_gid_to_bucket;
+    Gpu::DeviceVector<int> d_lev_gid_tid_to_bucket;
     Gpu::DeviceVector<int> d_lev_offsets;
+    Gpu::DeviceVector<int> d_gid_offsets;

 public:

     ParticleBufferMap () = default;

     ParticleBufferMap (const ParGDBBase* a_gdb);

+    ParticleBufferMap (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);
+
     void define (const ParGDBBase* a_gdb);

+    void define (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);
+
     bool isValid (const ParGDBBase* a_gdb) const;

+    bool isValid (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size) const;
+
     [[nodiscard]] AMREX_FORCE_INLINE
     int numLevels () const
     {
@@ -99,6 +117,13 @@ public:
         return m_bucket_to_gid[bid];
     }

+    [[nodiscard]] AMREX_FORCE_INLINE
+    int bucketToTile (int bid) const
+    {
+        AMREX_ASSERT(m_defined);
+        return m_bucket_to_tid[bid];
+    }
+
     [[nodiscard]] AMREX_FORCE_INLINE
     int bucketToLevel (int bid) const
     {
@@ -115,9 +140,16 @@ public:

     [[nodiscard]] AMREX_FORCE_INLINE
     int gridAndLevToBucket (int gid, int lev) const
+    {
+        AMREX_ASSERT(!m_do_tiling);
+        return gridAndTileAndLevToBucket(gid, 0, lev);
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    int gridAndTileAndLevToBucket (int gid, int tid, int lev) const
     {
         AMREX_ASSERT(m_defined);
-        return m_lev_gid_to_bucket[m_lev_offsets[lev] + gid];
+        return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev] + gid] + tid];
     }

     [[nodiscard]] AMREX_FORCE_INLINE
@@ -148,14 +180,21 @@ public:

     [[nodiscard]] AMREX_FORCE_INLINE
     int procID (int gid, int lev) const
+    {
+        AMREX_ASSERT(!m_do_tiling);
+        return procID(gid, 0, lev);
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    int procID (int gid, int tid, int lev) const
     {
         AMREX_ASSERT(m_defined);
-        return m_dm[lev][gid];
+        return m_bucket_to_pid[gridAndTileAndLevToBucket(gid, tid, lev)];
     }

-    [[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_to_bucket, d_lev_offsets);}
-    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_to_bucket.data(), d_lev_offsets.data());}
-    [[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_to_bucket.data(), m_lev_offsets.data());}
+    [[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_tid_to_bucket, d_lev_offsets, d_gid_offsets);}
+    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_tid_to_bucket.data(), d_lev_offsets.data(), d_gid_offsets.data());}
+    [[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_tid_to_bucket.data(), m_lev_offsets.data(), m_gid_offsets.data());}
 };

diff --git a/Src/Particle/AMReX_ParticleBufferMap.cpp b/Src/Particle/AMReX_ParticleBufferMap.cpp
index 95fd1a9b69c..d9b3d72a484 100644
--- a/Src/Particle/AMReX_ParticleBufferMap.cpp
+++ b/Src/Particle/AMReX_ParticleBufferMap.cpp
@@ -1,17 +1,32 @@
 #include <AMReX_ParticleBufferMap.H>
+#include <AMReX_ParticleUtil.H>

 namespace amrex {

 ParticleBufferMap::ParticleBufferMap (const ParGDBBase* a_gdb)
 {
-    define(a_gdb);
+    define(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+ParticleBufferMap::ParticleBufferMap (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                      const IntVect& a_tile_size)
+{
+    define(a_gdb, a_do_tiling, a_tile_size);
 }

 void ParticleBufferMap::define (const ParGDBBase* a_gdb)
+{
+    define(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+void ParticleBufferMap::define (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                const IntVect& a_tile_size)
 {
     BL_PROFILE("ParticleBufferMap::define");
     m_defined = true;
+    m_do_tiling = a_do_tiling;
+    m_tile_size = a_tile_size;

     int num_levels = a_gdb->finestLevel()+1;

     m_ba.resize(0);
@@ -27,57 +42,75 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     m_lev_offsets.resize(0);
     m_lev_offsets.push_back(0);
     for (int lev = 0; lev < num_levels; ++lev) {
-        m_lev_offsets.push_back(m_lev_offsets.back() +static_cast<int>( m_ba[lev].size()));
+        m_lev_offsets.push_back(m_lev_offsets.back() + static_cast<int>(m_ba[lev].size()) + 1);
     }

-    int num_buckets = m_lev_offsets.back();
+    m_gid_offsets.resize(m_lev_offsets.back(), 0);
+
+    int num_buckets = 0;
+    for (int lev = 0; lev < num_levels; ++lev) {
+        int level_offset = m_lev_offsets[lev];
+        for (int gid = 0; gid < m_ba[lev].size(); ++gid) {
+            m_gid_offsets[level_offset + gid] = num_buckets;
+            num_buckets += numTilesInBox(m_ba[lev][gid], m_do_tiling, m_tile_size);
+        }
+        m_gid_offsets[level_offset + static_cast<int>(m_ba[lev].size())] = num_buckets;
+    }

     m_bucket_to_gid.resize(0);
     m_bucket_to_gid.resize(num_buckets);
+    m_bucket_to_tid.resize(0);
+    m_bucket_to_tid.resize(num_buckets);
     m_bucket_to_lev.resize(0);
     m_bucket_to_lev.resize(num_buckets);
     m_bucket_to_pid.resize(0);
     m_bucket_to_pid.resize(num_buckets);

-    m_lev_gid_to_bucket.resize(0);
-    m_lev_gid_to_bucket.resize(num_buckets);
+    m_lev_gid_tid_to_bucket.resize(0);
+    m_lev_gid_tid_to_bucket.resize(num_buckets);

-    using ThreeIntTuple = std::tuple<int, int, int>;
-    std::vector<ThreeIntTuple> box_lev_proc_ids;
+    using FourIntTuple = std::tuple<int, int, int, int>;
+    std::vector<FourIntTuple> box_tile_lev_proc_ids;
     for (int lev = 0; lev < num_levels; ++lev) {
         for (int i = 0; i < m_ba[lev].size(); ++i) {
             int rank = ParallelContext::global_to_local_rank(m_dm[lev][i]);
-            box_lev_proc_ids.emplace_back(i, lev, rank);
+            int num_tiles = numTilesInBox(m_ba[lev][i], m_do_tiling, m_tile_size);
+            for (int tid = 0; tid < num_tiles; ++tid) {
+                box_tile_lev_proc_ids.emplace_back(i, tid, lev, rank);
+            }
         }
     }

-    std::sort(box_lev_proc_ids.begin(), box_lev_proc_ids.end(),
-              [](const ThreeIntTuple& a, const ThreeIntTuple& b) -> bool
+    std::sort(box_tile_lev_proc_ids.begin(), box_tile_lev_proc_ids.end(),
+              [](const FourIntTuple& a, const FourIntTuple& b) -> bool
               {
-                  int pid_a = std::get<2>(a);
-                  int pid_b = std::get<2>(b);
+                  int pid_a = std::get<3>(a);
+                  int pid_b = std::get<3>(b);
                   if (pid_a != pid_b) { return pid_a < pid_b; }

-                  int lev_a = std::get<1>(a);
-                  int lev_b = std::get<1>(b);
+                  int lev_a = std::get<2>(a);
+                  int lev_b = std::get<2>(b);
                   if (lev_a != lev_b) { return lev_a < lev_b; }

                   int gid_a = std::get<0>(a);
                   int gid_b = std::get<0>(b);
                   if (gid_a != gid_b) { return gid_a < gid_b; }

+                  int tid_a = std::get<1>(a);
+                  int tid_b = std::get<1>(b);
+                  if (tid_a != tid_b) { return tid_a < tid_b; }
+
                   return false;
               });

     int bucket_index = 0;
-    for (int lev = 0; lev < num_levels; ++lev) {
-        for (int i = 0; i < m_ba[lev].size(); ++i) {
-            m_bucket_to_gid[bucket_index] = std::get<0>(box_lev_proc_ids[bucket_index]);
-            m_bucket_to_lev[bucket_index] = std::get<1>(box_lev_proc_ids[bucket_index]);
-            m_bucket_to_pid[bucket_index] = std::get<2>(box_lev_proc_ids[bucket_index]);
-            ++bucket_index;
-        }
+    for (int i = 0; i < num_buckets; ++i) {
+        m_bucket_to_gid[bucket_index] = std::get<0>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_tid[bucket_index] = std::get<1>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_lev[bucket_index] = std::get<2>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_pid[bucket_index] = std::get<3>(box_tile_lev_proc_ids[bucket_index]);
+        ++bucket_index;
     }

     m_proc_box_counts.resize(0);
@@ -86,8 +119,10 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     for (int i = 0; i < num_buckets; ++i)
     {
         int lev = m_bucket_to_lev[i];
+        int gid = m_bucket_to_gid[i];
+        int tid = m_bucket_to_tid[i];
         int pid = m_bucket_to_pid[i];
-        m_lev_gid_to_bucket[m_lev_offsets[lev] + m_bucket_to_gid[i]] = i;
+        m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev] + gid] + tid] = i;
         m_proc_box_counts[pid]++;
     }

@@ -100,21 +135,33 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     d_bucket_to_pid.resize(0);
     d_bucket_to_pid.resize(num_buckets);

-    d_lev_gid_to_bucket.resize(0);
-    d_lev_gid_to_bucket.resize(num_buckets);
+    d_lev_gid_tid_to_bucket.resize(0);
+    d_lev_gid_tid_to_bucket.resize(num_buckets);

     d_lev_offsets.resize(0);
     d_lev_offsets.resize(m_lev_offsets.size());

-    Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin());
+    d_gid_offsets.resize(0);
+    d_gid_offsets.resize(m_gid_offsets.size());
+
+    Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_tid_to_bucket.begin(),m_lev_gid_tid_to_bucket.end(),d_lev_gid_tid_to_bucket.begin());
     Gpu::copyAsync(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin());
+    Gpu::copyAsync(Gpu::hostToDevice, m_gid_offsets.begin(),m_gid_offsets.end(),d_gid_offsets.begin());
     Gpu::copyAsync(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin());
     Gpu::streamSynchronize();
 }

 bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb) const
+{
+    return isValid(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                 const IntVect& a_tile_size) const
 {
     if (!m_defined) { return false; }

+    if (m_do_tiling != a_do_tiling) { return false; }
+    if (m_tile_size != a_tile_size) { return false; }
+
     int num_levs = a_gdb->finestLevel() + 1;
     if (num_levs != m_ba.size()) { return false; }
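Editorial note: with tiling, a bucket is addressed by (lev, gid, tid) through two offset tables. m_lev_offsets[lev] points at a level's run of per-grid entries (each level now carries one extra sentinel entry, hence the "+ 1" above), and m_gid_offsets holds the first bucket of every grid, so bucket = m_gid_offsets[m_lev_offsets[lev] + gid] + tid. A self-contained model of this indexing, with made-up tile counts:

    #include <cassert>
    #include <vector>

    int main () {
        // Level 0: two grids with 1 and 3 tiles; level 1: one grid with 2 tiles.
        std::vector<std::vector<int>> tiles_per_grid{{1, 3}, {2}};
        std::vector<int> lev_offsets{0};
        for (auto const& lev : tiles_per_grid) {
            lev_offsets.push_back(lev_offsets.back() + int(lev.size()) + 1); // +1 sentinel
        }
        std::vector<int> gid_offsets(lev_offsets.back(), 0);
        int num_buckets = 0;
        for (int lev = 0; lev < int(tiles_per_grid.size()); ++lev) {
            int off = lev_offsets[lev];
            for (int gid = 0; gid < int(tiles_per_grid[lev].size()); ++gid) {
                gid_offsets[off + gid] = num_buckets;           // first bucket of this grid
                num_buckets += tiles_per_grid[lev][gid];
            }
            gid_offsets[off + int(tiles_per_grid[lev].size())] = num_buckets; // sentinel
        }
        // bucket(lev, gid, tid) = gid_offsets[lev_offsets[lev] + gid] + tid
        assert(gid_offsets[lev_offsets[0] + 1] + 2 == 3); // level 0, grid 1, tile 2
        assert(num_buckets == 6);
        return 0;
    }
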
diff --git a/Src/Particle/AMReX_ParticleCommunication.H b/Src/Particle/AMReX_ParticleCommunication.H
index cc148c1ce5c..58496e389e6 100644
--- a/Src/Particle/AMReX_ParticleCommunication.H
+++ b/Src/Particle/AMReX_ParticleCommunication.H
@@ -7,12 +7,16 @@
 #include <AMReX_Gpu.H>
 #include <AMReX_GpuContainers.H>
 #include <AMReX_IntVect.H>
+#include <AMReX_OpenMP.H>
 #include <AMReX_ParGDB.H>
 #include <AMReX_ParticleBufferMap.H>
 #include <AMReX_TypeTraits.H>
 #include <AMReX_Scan.H>

 #include <map>
+#include <algorithm>
 #include <numeric>
+#include <tuple>
+#include <utility>

 namespace amrex {

@@ -59,21 +63,24 @@ struct RedistributeUnpackPolicy

 struct ParticleCopyOp
 {
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_boxes;
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_levels;
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_src_indices;
-    Vector<std::map<int, Gpu::DeviceVector<IntVect> > > m_periodic_shift;
+    using TileKey = std::pair<int, int>;
+
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_boxes;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_levels;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_tiles;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_src_indices;
+    Vector<std::map<TileKey, Gpu::DeviceVector<IntVect> > > m_periodic_shift;

     void clear ();

     void setNumLevels (int num_levels);

-    void resize (int gid, int lev, int size);
+    void resize (int gid, int tid, int lev, int size);

-    [[nodiscard]] int numCopies (int gid, int lev) const
+    [[nodiscard]] int numCopies (TileKey const& index, int lev) const
     {
         if (m_boxes.size() <= lev) { return 0; }
-        auto mit = m_boxes[lev].find(gid);
+        auto mit = m_boxes[lev].find(index);
         return mit == m_boxes[lev].end() ? 0 : int(mit->second.size());
     }

@@ -82,7 +89,272 @@ struct ParticleCopyOp

 struct ParticleCopyPlan
 {
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_dst_indices;
+    using TileKey = std::pair<int, int>;
+
+    struct StableOrderedAlgorithm {};
+    struct TwoPassHostAlgorithm {};
+    struct AtomicScatterAlgorithm {};
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpCopyWork
+    {
+        const int* boxes = nullptr;
+        const int* levs = nullptr;
+        const int* tiles = nullptr;
+        int* dst_indices = nullptr;
+        int num_copies = 0;
+    };
+#endif
+
+    struct BuildWorkspace
+    {
+        explicit BuildWorkspace (int a_num_buckets)
+            : num_buckets(a_num_buckets)
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            , omp_copy_offsets(1, 0)
+#endif
+        {}
+
+        int num_buckets = 0;
+        Gpu::HostVector<unsigned int> h_box_counts;
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+        Vector<unsigned int> omp_thread_box_counts;
+        Vector<OmpCopyWork> omp_copy_work;
+        Vector<Long> omp_copy_offsets;
+#endif
+    };
+
+    template <class PC, class F>
+    void forEachCopyBatch (const PC& pc, const ParticleCopyOp& op, F&& f)
+    {
+        const int num_levels = op.numLevels();
+        m_dst_indices.resize(num_levels);
+        auto&& batch_fn = std::forward<F>(f);
+
+        for (int lev = 0; lev < num_levels; ++lev)
+        {
+            for (const auto& kv : pc.GetParticles(lev))
+            {
+                auto index = kv.first;
+                int num_copies = op.numCopies(index, lev);
+                if (num_copies == 0) { continue; }
+
+                auto& dst_indices = m_dst_indices[lev][index];
+                dst_indices.resize(num_copies);
+
+                batch_fn(lev, index, num_copies, dst_indices);
+            }
+        }
+    }
+
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      StableOrderedAlgorithm, BuildWorkspace& workspace, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesStableOrdered");
+
+        forEachCopyBatch(pc, op,
+            [&] (int lev, TileKey const& index, int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+#ifdef AMREX_USE_GPU
+            const Gpu::DeviceVector<int>& d_boxes = op.m_boxes[lev].at(index);
+            Gpu::HostVector<int> h_boxes(d_boxes.size());
+            Gpu::copy(Gpu::deviceToHost, d_boxes.begin(), d_boxes.end(), h_boxes.begin());
+
+            const Gpu::DeviceVector<int>& d_levs = op.m_levels[lev].at(index);
+            Gpu::HostVector<int> h_levs(d_levs.size());
+            Gpu::copy(Gpu::deviceToHost, d_levs.begin(), d_levs.end(), h_levs.begin());
+
+            const Gpu::DeviceVector<int>& d_tiles = op.m_tiles[lev].at(index);
+            Gpu::HostVector<int> h_tiles(d_tiles.size());
+            Gpu::copy(Gpu::deviceToHost, d_tiles.begin(), d_tiles.end(), h_tiles.begin());
+
+            Gpu::HostVector<int> h_dst_indices(num_copies);
+#else
+            const Gpu::DeviceVector<int>& h_boxes = op.m_boxes[lev].at(index);
+            const Gpu::DeviceVector<int>& h_levs = op.m_levels[lev].at(index);
+            const Gpu::DeviceVector<int>& h_tiles = op.m_tiles[lev].at(index);
+
+            Gpu::DeviceVector<int>& h_dst_indices = dst_indices;
+#endif
+            for (int i = 0; i < num_copies; ++i) {
+                int dst_box = h_boxes[i];
+                if (dst_box >= 0) {
+                    int dst_tile = h_tiles[i];
+                    int dst_lev = h_levs[i];
+                    int dst_index = static_cast<int>(workspace.h_box_counts[getBucket(dst_lev, dst_box, dst_tile)]++);
+                    h_dst_indices[i] = dst_index;
+                }
+            }
+
+#ifdef AMREX_USE_GPU
+            Gpu::copy(Gpu::hostToDevice,
+                      h_dst_indices.begin(), h_dst_indices.end(),
+                      dst_indices.begin());
+#endif
+        });
+    }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      TwoPassHostAlgorithm, BuildWorkspace& workspace, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesTwoPassHost");
+
+        workspace.omp_thread_box_counts.resize(OpenMP::get_max_threads()*workspace.num_buckets, 0U);
+
+        forEachCopyBatch(pc, op,
+            [&op, &workspace] (int lev, TileKey const& index, int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levs = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            auto* p_dst_indices = dst_indices.dataPtr();
+
+            workspace.omp_copy_work.push_back({p_boxes, p_levs, p_tiles, p_dst_indices, num_copies});
+            workspace.omp_copy_offsets.push_back(workspace.omp_copy_offsets.back() + num_copies);
+        });
+
+        if (workspace.omp_copy_work.empty()) { return; }
+
+        std::fill(workspace.omp_thread_box_counts.begin(), workspace.omp_thread_box_counts.end(), 0U);
+        auto* p_omp_thread_box_counts = workspace.omp_thread_box_counts.data();
+        const auto* p_omp_copy_work = workspace.omp_copy_work.data();
+        const auto* p_omp_copy_offsets = workspace.omp_copy_offsets.data();
+        const Long total_num_copies = workspace.omp_copy_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+            auto* p_thread_box_counts = p_omp_thread_box_counts + thread_num*workspace.num_buckets;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(workspace.omp_copy_offsets.begin(),
+                                                              workspace.omp_copy_offsets.end(),
+                                                              ibegin)
+                                             - workspace.omp_copy_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(workspace.omp_copy_work.size()) &&
+                       p_omp_copy_offsets[iwork] < iend)
+                {
+                    auto const& work = p_omp_copy_work[iwork];
+                    Long global_begin = std::max(ibegin, p_omp_copy_offsets[iwork]);
+                    Long global_end = std::min(iend, p_omp_copy_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - p_omp_copy_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - p_omp_copy_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levs[i];
+                            ++p_thread_box_counts[getBucket(dst_lev, dst_box, dst_tile)];
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+
+#pragma omp barrier
+#pragma omp for
+            for (int ibucket = 0; ibucket < workspace.num_buckets; ++ibucket)
+            {
+                unsigned int offset = workspace.h_box_counts[ibucket];
+                for (int tid = 0; tid < num_threads; ++tid)
+                {
+                    auto& count = p_omp_thread_box_counts[tid*workspace.num_buckets + ibucket];
+                    unsigned int total = count;
+                    count = offset;
+                    offset += total;
+                }
+                workspace.h_box_counts[ibucket] = offset;
+            }
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(workspace.omp_copy_offsets.begin(),
+                                                              workspace.omp_copy_offsets.end(),
+                                                              ibegin)
+                                             - workspace.omp_copy_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(workspace.omp_copy_work.size()) &&
+                       p_omp_copy_offsets[iwork] < iend)
+                {
+                    auto const& work = p_omp_copy_work[iwork];
+                    Long global_begin = std::max(ibegin, p_omp_copy_offsets[iwork]);
+                    Long global_end = std::min(iend, p_omp_copy_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - p_omp_copy_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - p_omp_copy_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levs[i];
+                            int bucket = getBucket(dst_lev, dst_box, dst_tile);
+                            work.dst_indices[i] = static_cast<int>(p_thread_box_counts[bucket]++);
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
+
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      AtomicScatterAlgorithm, BuildWorkspace&, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesAtomicScatter");
+
+        auto* p_dst_box_counts = m_box_counts_d.dataPtr();
+
+        forEachCopyBatch(pc, op,
+            [&op, &getBucket, p_dst_box_counts] (int lev, TileKey const& index,
+                                                 int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levs = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            auto* p_dst_indices = dst_indices.dataPtr();
+
+            amrex::ParallelFor(num_copies, [=] AMREX_GPU_DEVICE (int i)
+            {
+                int dst_box = p_boxes[i];
+                if (dst_box >= 0)
+                {
+                    int dst_tile = p_tiles[i];
+                    int dst_lev = p_levs[i];
+                    int dst_index = static_cast<int>(HostDevice::Atomic::FetchAdd(
+                        &p_dst_box_counts[getBucket(dst_lev, dst_box, dst_tile)], 1U));
+                    p_dst_indices[i] = dst_index;
+                }
+            });
+        });
+    }
+
+    void finalizeBuildBoxCounts (BuildWorkspace const& workspace, bool use_host_box_counters)
+    {
+        if (use_host_box_counters) {
+            Gpu::copy(Gpu::hostToDevice,
+                      workspace.h_box_counts.begin(), workspace.h_box_counts.end(),
+                      m_box_counts_d.begin());
+        }
+
+        amrex::Gpu::exclusive_scan(m_box_counts_d.begin(), m_box_counts_d.end(),
+                                   m_box_offsets.begin());
+    }
+
+public:
+
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_dst_indices;

     Gpu::DeviceVector<unsigned int> m_box_counts_d;
     Gpu::HostVector<unsigned int> m_box_counts_h;
@@ -91,6 +363,7 @@ struct ParticleCopyPlan
     Vector<int> m_rcv_box_counts;
     Vector<int> m_rcv_box_offsets;
     Vector<int> m_rcv_box_ids;
+    Vector<int> m_rcv_box_tids;
     Vector<int> m_rcv_box_pids;
     Vector<int> m_rcv_box_levs;

@@ -135,22 +408,18 @@ struct ParticleCopyPlan
               const ParticleCopyOp& op,
               const Vector<int>& int_comp_mask,
               const Vector<int>& real_comp_mask,
-              bool local)
+              int local)
     {
         BL_PROFILE("ParticleCopyPlan::build");

-        m_local = local;
         ParmParse pp("particles");
         pp.query("do_one_sided_comms", m_do_one_sided_comms);
+        const int num_buckets = pc.BufferMap().numBuckets();

-        const int ngrow = 1; // note - fix
-
-        const int num_levels = op.numLevels();
-        const int num_buckets = pc.BufferMap().numBuckets();
-
-        if (m_local)
+        m_local = local;
+        if (local)
         {
-            m_neighbor_procs = pc.NeighborProcs(ngrow);
+            m_neighbor_procs = pc.NeighborProcs(local);
         }
         else
         {
@@ -161,69 +430,32 @@ struct ParticleCopyPlan
         m_box_counts_d.resize(0);
         m_box_counts_d.resize(num_buckets+1, 0);
         m_box_offsets.resize(num_buckets+1);
-        auto* p_dst_box_counts = m_box_counts_d.dataPtr();
-        auto getBucket = pc.stableRedistribute() ?
-            pc.BufferMap().getHostBucketFunctor() : pc.BufferMap().getBucketFunctor();

-        Gpu::HostVector<unsigned int> h_box_counts;
-        if (pc.stableRedistribute() ) {
-            h_box_counts.resize(m_box_counts_d.size(), 0);
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+        constexpr bool use_host_bucket_counters = true;
+#else
+        constexpr bool use_host_bucket_counters = false;
+#endif
+        BuildWorkspace workspace(num_buckets);
+        bool use_host_box_counters = pc.stableRedistribute() || use_host_bucket_counters;
+        if (use_host_box_counters) {
+            workspace.h_box_counts.resize(m_box_counts_d.size(), 0);
         }

-        m_dst_indices.resize(num_levels);
-        for (int lev = 0; lev < num_levels; ++lev)
+        if (pc.stableRedistribute())
         {
-            for (const auto& kv : pc.GetParticles(lev))
-            {
-                int gid = kv.first.first;
-                int num_copies = op.numCopies(gid, lev);
-                if (num_copies == 0) { continue; }
-                m_dst_indices[lev][gid].resize(num_copies);
-
-                if (pc.stableRedistribute()) {
-                    const Gpu::DeviceVector<int>& d_boxes = op.m_boxes[lev].at(gid);
-                    Gpu::HostVector<int> h_boxes(d_boxes.size());
-                    Gpu::copy(Gpu::deviceToHost,d_boxes.begin(),d_boxes.end(),h_boxes.begin());
-
-                    const Gpu::DeviceVector<int>& d_levs = op.m_levels[lev].at(gid);
-                    Gpu::HostVector<int> h_levs(d_levs.size());
-                    Gpu::copy(Gpu::deviceToHost,d_levs.begin(),d_levs.end(),h_levs.begin());
-
-                    Gpu::HostVector<int> h_dst_indices(num_copies);
-                    for (int i = 0; i < num_copies; ++i) {
-                        int dst_box = h_boxes[i];
-                        if (dst_box >= 0) {
-                            int dst_lev = h_levs[i];
-                            int index = static_cast<int>(h_box_counts[getBucket(dst_lev, dst_box)]++);
-                            h_dst_indices[i] = index;
-                        }
-                    }
-                    Gpu::copy(Gpu::hostToDevice,h_dst_indices.begin(),h_dst_indices.end(),m_dst_indices[lev][gid].begin());
-                }
-                else {
-                    const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
-                    const auto* p_levs = op.m_levels[lev].at(gid).dataPtr();
-                    auto* p_dst_indices = m_dst_indices[lev][gid].dataPtr();
-                    AMREX_FOR_1D ( num_copies, i,
-                    {
-                        int dst_box = p_boxes[i];
-                        if (dst_box >= 0)
-                        {
-                            int dst_lev = p_levs[i];
-                            int index = static_cast<int>(Gpu::Atomic::Add(
-                                &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1U));
-                            p_dst_indices[i] = index;
-                        }
-                    });
-                }
-            }
+            buildCopies(pc, op, StableOrderedAlgorithm{}, workspace, pc.BufferMap().getHostBucketFunctor());
         }
-
-        if (pc.stableRedistribute()) {
-            Gpu::copy(Gpu::hostToDevice,h_box_counts.begin(),h_box_counts.end(),m_box_counts_d.begin());
+        else
+        {
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            buildCopies(pc, op, TwoPassHostAlgorithm{}, workspace, pc.BufferMap().getBucketFunctor());
+#else
+            buildCopies(pc, op, AtomicScatterAlgorithm{}, workspace, pc.BufferMap().getBucketFunctor());
+#endif
         }

-        amrex::Gpu::exclusive_scan(m_box_counts_d.begin(), m_box_counts_d.end(),
-                                   m_box_offsets.begin());
+        finalizeBuildBoxCounts(workspace, use_host_box_counters);

         m_box_counts_h.resize(m_box_counts_d.size());
         Gpu::copyAsync(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(),
@@ -333,10 +565,10 @@ struct GetSendBufferOffset
     {}

     AMREX_FORCE_INLINE AMREX_GPU_DEVICE
-    Long operator() (int dst_box, int dst_lev, std::size_t psize, int i) const
+    Long operator() (int dst_box, int dst_tile, int dst_lev, std::size_t psize, int i) const
     {
-        int dst_pid = m_get_pid(dst_lev, dst_box);
-        Long dst_offset = Long(psize)*(m_box_offsets[m_get_bucket(dst_lev, dst_box)] + i);
+        int dst_pid = m_get_pid(dst_lev, dst_box, dst_tile);
+        Long dst_offset = Long(psize)*(m_box_offsets[m_get_bucket(dst_lev, dst_box, dst_tile)] + i);
         dst_offset += Long(m_pad_correction[dst_pid]);
         return dst_offset;
     }
@@ -381,36 +613,53 @@ void packBuffer (const PC& pc, const ParticleCopyOp& op, const ParticleCopyPlan&
     const auto plo = pc.Geom(0).ProbLoArray();
     const auto phi = pc.Geom(0).ProbHiArray();
     const auto is_per = pc.Geom(0).isPeriodicArray();
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpPackWork
+    {
+        typename PC::ParticleTileType const* src_tile = nullptr;
+        const int* boxes = nullptr;
+        const int* levels = nullptr;
+        const int* tiles = nullptr;
+        const int* src_indices = nullptr;
+        const IntVect* periodic_shift = nullptr;
+        const int* dst_indices = nullptr;
+        int num_copies = 0;
+    };
+    Vector<OmpPackWork> omp_pack_work;
+    Vector<Long> omp_pack_offsets(1, 0);
+#endif

     for (int lev = 0; lev < num_levels; ++lev)
     {
         auto& plev = pc.GetParticles(lev);
         for (auto& kv : plev)
         {
-            int gid = kv.first.first;
-            int tid = kv.first.second;
-            auto index = std::make_pair(gid, tid);
-
+            auto index = kv.first;
             auto& src_tile = plev.at(index);
-            const auto& ptd = src_tile.getConstParticleTileData();
-
-            int num_copies = op.numCopies(gid, lev);
+            int num_copies = op.numCopies(index, lev);
             if (num_copies == 0) { continue; }

-            const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
-            const auto* p_levels = op.m_levels[lev].at(gid).dataPtr();
-            const auto* p_src_indices = op.m_src_indices[lev].at(gid).dataPtr();
-            const auto* p_periodic_shift = op.m_periodic_shift[lev].at(gid).dataPtr();
-            const auto* p_dst_indices = plan.m_dst_indices[lev].at(gid).dataPtr();
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levels = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            const auto* p_src_indices = op.m_src_indices[lev].at(index).dataPtr();
+            const auto* p_periodic_shift = op.m_periodic_shift[lev].at(index).dataPtr();
+            const auto* p_dst_indices = plan.m_dst_indices[lev].at(index).dataPtr();

+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            omp_pack_work.push_back({&src_tile, p_boxes, p_levels, p_tiles,
+                                     p_src_indices, p_periodic_shift, p_dst_indices, num_copies});
+            omp_pack_offsets.push_back(omp_pack_offsets.back() + num_copies);
+#else
+            const auto& ptd = src_tile.getConstParticleTileData();
             auto* p_snd_buffer = snd_buffer.dataPtr();

             GetSendBufferOffset get_offset(plan, pc.BufferMap());
-
-            AMREX_FOR_1D ( num_copies, i,
+            amrex::ParallelFor(num_copies, [=] AMREX_GPU_DEVICE (int i)
             {
                 int dst_box = p_boxes[i];
                 if (dst_box >= 0)
                 {
+                    int dst_tile = p_tiles[i];
                     int dst_lev = p_levels[i];
-                    auto dst_offset = get_offset(dst_box, dst_lev, psize, p_dst_indices[i]);
+                    auto dst_offset = get_offset(dst_box, dst_tile, dst_lev, psize, p_dst_indices[i]);
                     int src_index = p_src_indices[i];
                     ptd.packParticleData(p_snd_buffer, src_index, dst_offset, p_comm_real, p_comm_int);
@@ -444,8 +693,87 @@ void packBuffer (const PC& pc, const ParticleCopyOp& op, const ParticleCopyPlan&
                     }
                 }
             });
+#endif
         }
     }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    if (!omp_pack_work.empty())
+    {
+        auto* p_snd_buffer = snd_buffer.dataPtr();
+        GetSendBufferOffset get_offset(plan, pc.BufferMap());
+        const Long total_num_copies = omp_pack_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(omp_pack_offsets.begin(),
+                                                              omp_pack_offsets.end(),
+                                                              ibegin)
+                                             - omp_pack_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(omp_pack_work.size()) &&
+                       omp_pack_offsets[iwork] < iend)
+                {
+                    auto const& work = omp_pack_work[iwork];
+                    auto const& ptd = work.src_tile->getConstParticleTileData();
+                    Long global_begin = std::max(ibegin, omp_pack_offsets[iwork]);
+                    Long global_end = std::min(iend, omp_pack_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - omp_pack_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - omp_pack_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levels[i];
+                            auto dst_offset = get_offset(dst_box, dst_tile, dst_lev, psize,
+                                                         work.dst_indices[i]);
+                            int src_index = work.src_indices[i];
+                            ptd.packParticleData(p_snd_buffer, src_index, dst_offset,
+                                                 p_comm_real, p_comm_int);
+
+                            const IntVect& pshift = work.periodic_shift[i];
+                            bool do_periodic_shift =
+                                AMREX_D_TERM( (is_per[0] && pshift[0] != 0),
+                                           || (is_per[1] && pshift[1] != 0),
+                                           || (is_per[2] && pshift[2] != 0) );
+
+                            if (do_periodic_shift)
+                            {
+                                ParticleReal pos[AMREX_SPACEDIM];
+                                Long pos_offset = dst_offset;
+                                if constexpr (PC::ParticleType::is_soa_particle) {
+                                    pos_offset += sizeof(uint64_t);
+                                }
+                                amrex::Gpu::memcpy(&pos[0], &p_snd_buffer[pos_offset],
+                                                   AMREX_SPACEDIM*sizeof(ParticleReal));
+                                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+                                {
+                                    if (! is_per[idim]) { continue; }
+                                    if (pshift[idim] > 0) {
+                                        pos[idim] += phi[idim] - plo[idim];
+                                    } else if (pshift[idim] < 0) {
+                                        pos[idim] -= phi[idim] - plo[idim];
+                                    }
+                                }
+                                amrex::Gpu::memcpy(&p_snd_buffer[pos_offset], &pos[0],
+                                                   AMREX_SPACEDIM*sizeof(ParticleReal));
+                            }
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
 }

 template <class PC, class Buffer, class UnpackPolicy>
 void unpackBuffer (PC& pc, const ParticleCopyPlan& plan, const Buffer& snd_buffer, UnpackPolicy const& policy)
...
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpUnpackWork
+    {
+        typename PC::ParticleTileType* tile = nullptr;
+        int gid = 0;
+        int tid = 0;
+        int lev = 0;
+        int offset = 0;
+        int size = 0;
+    };
+    Vector<OmpUnpackWork> omp_unpack_work;
+    Vector<Long> omp_unpack_offsets(1, 0);
+#endif
+
     // local unpack
     int uindex = 0;
     for (int lev = 0; lev < num_levels; ++lev)
@@ -496,21 +838,69 @@ void unpackBuffer (PC& pc, const ParticleCopyPlan& plan, const Buffer& snd_buffe
             auto& tile = plev[index];

             GetSendBufferOffset get_offset(plan, pc.BufferMap());
-            auto p_snd_buffer = snd_buffer.dataPtr();

             int offset = offsets[uindex];
             int size = sizes[uindex];
             ++uindex;

+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            omp_unpack_work.push_back({&tile, gid, tid, lev, offset, size});
+            omp_unpack_offsets.push_back(omp_unpack_offsets.back() + size);
+#else
+            auto p_snd_buffer = snd_buffer.dataPtr();
             auto ptd = tile.getParticleTileData();
-            AMREX_FOR_1D ( size, i,
+            amrex::ParallelFor(size, [=] AMREX_GPU_DEVICE (int i)
             {
-                auto src_offset = get_offset(gid, lev, psize, i);
+                auto src_offset = get_offset(gid, tid, lev, psize, i);
                 int dst_index = offset + i;
                 ptd.unpackParticleData(p_snd_buffer, src_offset, dst_index,
                                        p_comm_real, p_comm_int);
             });
+#endif
         }
     }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    if (!omp_unpack_work.empty())
+    {
+        GetSendBufferOffset get_offset(plan, pc.BufferMap());
+        auto p_snd_buffer = snd_buffer.dataPtr();
+        const Long total_num_copies = omp_unpack_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(omp_unpack_offsets.begin(),
+                                                              omp_unpack_offsets.end(),
+                                                              ibegin)
+                                             - omp_unpack_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(omp_unpack_work.size()) &&
+                       omp_unpack_offsets[iwork] < iend)
+                {
+                    auto const& work = omp_unpack_work[iwork];
+                    auto ptd = work.tile->getParticleTileData();
+                    Long global_begin = std::max(ibegin, omp_unpack_offsets[iwork]);
+                    Long global_end = std::min(iend, omp_unpack_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - omp_unpack_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - omp_unpack_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        auto src_offset = get_offset(work.gid, work.tid, work.lev, psize, i);
+                        int dst_index = work.offset + i;
+                        ptd.unpackParticleData(p_snd_buffer, src_offset, dst_index,
+                                               p_comm_real, p_comm_int);
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
 }

 template <class PC, class Buffer, class UnpackPolicy>
 void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, UnpackPolicy const& policy)
...
+            auto src_offset = psize*static_cast<Long>(offset + ip)
+                + static_cast<Long>(p_pad_adjust[procindex]);
             int dst_index = dst_offset + ip;
             ptd.unpackParticleData(p_rcv_buffer, src_offset, dst_index,
                                    p_comm_real, p_comm_int);
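Editorial note: TwoPassHostAlgorithm above is the usual two-pass counting scatter. Pass one counts destination buckets into per-thread counters over a fixed work split, a serialized sweep turns those counts into per-thread starting offsets per bucket, and pass two revisits the exact same split and writes each item into its reserved slot. Unlike AtomicScatterAlgorithm, the result is deterministic and uses no atomics. A stripped-down sketch of the same pattern in plain OpenMP (names and data are hypothetical; schedule(static) over identical loops guarantees both passes see the same iteration-to-thread mapping):

    #include <cassert>
    #include <cstddef>
    #include <omp.h>
    #include <vector>

    // Assign each item a unique slot, grouped by bucket, without atomics.
    std::vector<int> stable_scatter (const std::vector<int>& bucket_of, int nbuckets)
    {
        int nthreads = omp_get_max_threads();
        std::vector<unsigned> counts(std::size_t(nthreads)*nbuckets, 0);
        std::vector<int> dst(bucket_of.size());
    #pragma omp parallel
        {
            int t = omp_get_thread_num();
    #pragma omp for schedule(static)
            for (int i = 0; i < int(bucket_of.size()); ++i) {   // pass 1: count
                ++counts[std::size_t(t)*nbuckets + bucket_of[i]];
            }
    #pragma omp single
            {                                                   // counts -> offsets
                unsigned off = 0;
                for (int b = 0; b < nbuckets; ++b) {
                    for (int tt = 0; tt < nthreads; ++tt) {
                        unsigned c = counts[std::size_t(tt)*nbuckets + b];
                        counts[std::size_t(tt)*nbuckets + b] = off;
                        off += c;
                    }
                }
            }
    #pragma omp for schedule(static)
            for (int i = 0; i < int(bucket_of.size()); ++i) {   // pass 2: scatter
                dst[i] = int(counts[std::size_t(t)*nbuckets + bucket_of[i]]++);
            }
        }
        return dst;
    }

    int main () {
        auto dst = stable_scatter({1, 0, 1, 0}, 2);
        assert(dst[1] == 0 && dst[3] == 1); // bucket 0 slots precede bucket 1 slots
        return 0;
    }
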
diff --git a/Src/Particle/AMReX_ParticleCommunication.cpp b/Src/Particle/AMReX_ParticleCommunication.cpp
index c6835dd86db..898c3f78612 100644
--- a/Src/Particle/AMReX_ParticleCommunication.cpp
+++ b/Src/Particle/AMReX_ParticleCommunication.cpp
@@ -8,6 +8,7 @@ void ParticleCopyOp::clear ()
 {
     m_boxes.resize(0);
     m_levels.resize(0);
+    m_tiles.resize(0);
     m_src_indices.resize(0);
     m_periodic_shift.resize(0);
 }
@@ -16,20 +17,23 @@ void ParticleCopyOp::setNumLevels (int num_levels)
 {
     m_boxes.resize(num_levels);
     m_levels.resize(num_levels);
+    m_tiles.resize(num_levels);
     m_src_indices.resize(num_levels);
     m_periodic_shift.resize(num_levels);
 }

-void ParticleCopyOp::resize (int gid, int lev, int size)
+void ParticleCopyOp::resize (int gid, int tid, int lev, int size)
 {
     if (lev >= m_boxes.size())
     {
         setNumLevels(lev+1);
     }
-    m_boxes[lev][gid].resize(size);
-    m_levels[lev][gid].resize(size);
-    m_src_indices[lev][gid].resize(size);
-    m_periodic_shift[lev][gid].resize(size);
+    const auto index = std::make_pair(gid, tid);
+    m_boxes[lev][index].resize(size);
+    m_levels[lev][index].resize(size);
+    m_tiles[lev][index].resize(size);
+    m_src_indices[lev][index].resize(size);
+    m_periodic_shift[lev][index].resize(size, IntVect(AMREX_D_DECL(0,0,0)));
 }

 void ParticleCopyPlan::clear ()
@@ -42,6 +46,7 @@ void ParticleCopyPlan::clear ()
     m_rcv_box_counts.clear();
     m_rcv_box_offsets.clear();
     m_rcv_box_ids.clear();
+    m_rcv_box_tids.clear();
     m_rcv_box_pids.clear();
     m_rcv_box_levs.clear();
 }
@@ -79,6 +84,7 @@ void ParticleCopyPlan::buildMPIStart (const ParticleContainerBase& pc, const Par
         for (auto bucket : box_buffer_indices)
         {
             int dst = map.bucketToGrid(bucket);
+            int tid = map.bucketToTile(bucket);
             int lev = map.bucketToLevel(bucket);
             AMREX_ASSERT(m_box_counts_h[bucket] <= static_cast<unsigned int>(std::numeric_limits<int>::max()));
             int npart = static_cast<int>(m_box_counts_h[bucket]);
@@ -87,9 +93,10 @@ void ParticleCopyPlan::buildMPIStart (const ParticleContainerBase& pc, const Par
             if (i == MyProc) { continue; }
             snd_data[i].push_back(npart);
             snd_data[i].push_back(dst);
+            snd_data[i].push_back(tid);
             snd_data[i].push_back(lev);
             snd_data[i].push_back(MyProc);
-            nbytes += 4*sizeof(int);
+            nbytes += 5*sizeof(int);
         }
         m_Snds[i] = nbytes;
         m_NumSnds += nbytes;
@@ -229,17 +236,19 @@ void ParticleCopyPlan::buildMPIFinish (const ParticleBufferMap& map) // NOLINT(r
     m_rcv_box_offsets.resize(0);
     m_rcv_box_counts.resize(0);
     m_rcv_box_ids.resize(0);
+    m_rcv_box_tids.resize(0);
     m_rcv_box_levs.resize(0);
     m_rcv_box_pids.resize(0);

     m_rcv_box_offsets.push_back(0);
-    for (int i = 0, N = static_cast<int>(m_rcv_data.size()); i < N; i+=4)
+    for (int i = 0, N = static_cast<int>(m_rcv_data.size()); i < N; i+=5)
     {
         m_rcv_box_counts.push_back(m_rcv_data[i]);
-        AMREX_ASSERT(ParallelContext::MyProcSub() == map.procID(m_rcv_data[i+1], m_rcv_data[i+2]));
+        AMREX_ASSERT(ParallelContext::MyProcSub() == map.procID(m_rcv_data[i+1], m_rcv_data[i+2], m_rcv_data[i+3]));
         m_rcv_box_ids.push_back(m_rcv_data[i+1]);
-        m_rcv_box_levs.push_back(m_rcv_data[i+2]);
-        m_rcv_box_pids.push_back(m_rcv_data[i+3]);
+        m_rcv_box_tids.push_back(m_rcv_data[i+2]);
+        m_rcv_box_levs.push_back(m_rcv_data[i+3]);
+        m_rcv_box_pids.push_back(m_rcv_data[i+4]);
         m_rcv_box_offsets.push_back(m_rcv_box_offsets.back() + m_rcv_box_counts.back());
     }
 }
@@ -251,7 +260,7 @@ void ParticleCopyPlan::buildMPIFinish (const ParticleBufferMap& map) // NOLINT(r
         const auto Cnt = m_Rcvs[Who]/sizeof(int);

         Long nparticles = 0;
-        for (auto i = offset; i < offset + Cnt; i +=4)
+        for (auto i = offset; i < offset + Cnt; i +=5)
         {
             nparticles += m_rcv_data[i];
         }

diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H
index e7e1feb8f5e..128eb1a7d16 100644
--- a/Src/Particle/AMReX_ParticleContainer.H
+++ b/Src/Particle/AMReX_ParticleContainer.H
@@ -1283,8 +1283,8 @@ public:
     void RedistributeCPU (int lev_min = 0, int lev_max = -1,
                           int nGrow = 0, int local=0, bool remove_negative=true);

-    void RedistributeGPU (int lev_min = 0, int lev_max = -1, int nGrow = 0, int local=0,
-                          bool remove_negative=true);
+    void Redistribute_impl (int lev_min = 0, int lev_max = -1, int nGrow = 0, int local=0,
+                            bool remove_negative=true);

     void ReserveForRedistribute (ParticleCopyPlan const& plan);
@@ -1546,6 +1546,17 @@ private:
     void RedistributeMPI (std::map<int, Vector<char> >& not_ours,
                           int lev_min = 0, int lev_max = 0, int nGrow = 0, int local=0);

+    AMREX_FORCE_INLINE
+    int hostPartitionTile (ParticleTileType& src_tile,
+                           int lev, int gid, int tid,
+                           int lev_min, int lev_max, int nGrow, int local,
+                           bool remove_negative, int myproc,
+                           Gpu::DeviceVector<int>& boxes,
+                           Gpu::DeviceVector<int>& levels,
+                           Gpu::DeviceVector<int>& tiles,
+                           Gpu::DeviceVector<int>& src_indices,
+                           Gpu::DeviceVector<IntVect>& periodic_shift);
+
     template <typename P>
     void locateParticle (P& p, ParticleLocData& pld,
                          int lev_min, int lev_max, int nGrow, int local_grid=-1) const;

diff --git a/Src/Particle/AMReX_ParticleContainerBase.cpp b/Src/Particle/AMReX_ParticleContainerBase.cpp
index 5a93871e012..410607cf212 100644
--- a/Src/Particle/AMReX_ParticleContainerBase.cpp
+++ b/Src/Particle/AMReX_ParticleContainerBase.cpp
@@ -74,9 +74,9 @@ ParticleContainerBase::defineBufferMap () const
 {
     BL_PROFILE("ParticleContainer::defineBufferMap");

-    if (! m_buffer_map.isValid(GetParGDB()))
+    if (! m_buffer_map.isValid(GetParGDB(), do_tiling, tile_size))
     {
-        m_buffer_map.define(GetParGDB());
+        m_buffer_map.define(GetParGDB(), do_tiling, tile_size);
     }
 }
m_buffer_map.isValid(GetParGDB(), do_tiling, tile_size)) { - m_buffer_map.define(GetParGDB()); + m_buffer_map.define(GetParGDB(), do_tiling, tile_size); } } diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index e916c4100b2..858400ee9dd 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -812,7 +812,7 @@ struct FilterVirt int operator() (const SrcData& src, int src_i) const noexcept { auto iv = getParticleCell(src, src_i, m_plo, m_dxi, m_domain); - return (m_assign_buffer_grid(iv)!=-1); + return (m_assign_buffer_grid(iv).first != -1); } }; @@ -894,7 +894,7 @@ ParticleContainer_impl> locator; - locator.build(buffer, geom); + locator.build(buffer, geom, do_tiling, tile_size); AssignGrid> assign_buffer_grid = locator.getGridAssignor(); amrex::ParticleToMesh(*this, mf, level, @@ -904,7 +904,7 @@ ParticleContainer_impl const& dxi_loc) { auto iv = getParticleCell(p, plo_loc, dxi_loc, domain); - if(assign_buffer_grid(iv)==-1) + if(assign_buffer_grid(iv).first == -1) { //Ordering may make this not unique for (int i = 0; i < NArrayReal; ++i) @@ -938,7 +938,7 @@ ParticleContainer_impl(tup_min); const auto p_boxes_max = amrex::get<0>(tup_max); - const auto p_levs_max = amrex::get<1>(tup_max); + const auto p_levs_max = amrex::get<2>(tup_max); return p_boxes_max >=0 && p_boxes == m_gid && p_levs_max == m_lev_max; } }; @@ -1101,8 +1101,8 @@ ParticleContainer_impl class Allocator, class CellAssignor> +AMREX_FORCE_INLINE +int +ParticleContainer_impl +::hostPartitionTile (ParticleTileType& src_tile, + int lev, int gid, int tid, + int lev_min, int lev_max, int nGrow, int local, + bool remove_negative, int myproc, + Gpu::DeviceVector& boxes, + Gpu::DeviceVector& levels, + Gpu::DeviceVector& tiles, + Gpu::DeviceVector& src_indices, + Gpu::DeviceVector& periodic_shift) +{ + ParticleLocData pld; + auto ptd = src_tile.getParticleTileData(); + int num_move = 0; + + Long last = src_tile.numParticles() - 1; + Long pindex = 0; + int who = -1; + while (pindex <= last) { + decltype(auto) p = ptd[pindex]; + + // remove invalid if needed + if (!ptd.id(pindex).is_valid()) { + if (remove_negative) { + copyParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + --last; + } else { + ++pindex; + } + continue; + } + + // Fast path: if on the finest level, try the last good location first + // If not on finest level, we need to do the full locate because even if + // the last assignment works, the particle might belong on a finer level. + if (lev == lev_max && pld.m_tile == tid && pld.m_grid == gid && pld.m_lev == lev && who == myproc) { + const auto iv = Index(p, lev); + if (pld.m_tilebox.contains(iv)) { + ++pindex; + continue; // don't need correctCellVectors or particlePostLocate if it stays where it is + } + } + + // full locate + locateParticle(p, pld, lev_min, lev_max, nGrow, local ? gid : -1); + particlePostLocate(p, pld, lev); + + // particle may have been invalidated, so check if we need to remove again. 
+ if (!ptd.id(pindex).is_valid()) { + copyParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + --last; + continue; + } + + // flag particle for move if needed + who = BufferMap().procID(pld.m_grid, pld.m_tile, pld.m_lev); + if (pld.m_lev != lev || pld.m_grid != gid || pld.m_tile != tid || who != myproc) { + // the particle is valid but needs to be sent somewhere else + swapParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + boxes[num_move] = pld.m_grid; + levels[num_move] = pld.m_lev; + tiles[num_move] = pld.m_tile; + src_indices[num_move] = static_cast(last); + ++num_move; + --last; + continue; + } + + ++pindex; // if here, particle stays + } + + boxes.resize(num_move); + levels.resize(num_move); + tiles.resize(num_move); + src_indices.resize(num_move); + periodic_shift.resize(num_move); + + return static_cast(pindex); +} + // -// The GPU implementation of Redistribute +// Shared implementation of Redistribute // template class Allocator, class CellAssignor> void ParticleContainer_impl -::RedistributeGPU (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) +::Redistribute_impl (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) { -#ifdef AMREX_USE_GPU - if (local) { AMREX_ASSERT(numParticlesOutOfRange(*this, lev_min, lev_max, local) == 0); } - // sanity check - AMREX_ALWAYS_ASSERT(do_tiling == false); - - BL_PROFILE("ParticleContainer::RedistributeGPU()"); + BL_PROFILE("ParticleContainer::Redistribute_impl()"); BL_PROFILE_VAR_NS("Redistribute_partition", blp_partition); int theEffectiveFinestLevel = m_gdb->finestLevel(); @@ -1399,69 +1470,189 @@ ParticleContainer_impldefineBufferMap(); - if (! m_particle_locator.isValid(GetParGDB())) { m_particle_locator.build(GetParGDB()); } +#ifndef AMREX_USE_GPU + if (local > 0) { BuildRedistributeMask(0, local); } +#else + if (! 
m_particle_locator.isValid(GetParGDB(), do_tiling, tile_size)) { m_particle_locator.build(GetParGDB(), do_tiling, tile_size); } m_particle_locator.setGeometry(GetParGDB()); - auto assign_grid = m_particle_locator.getGridAssignor(); +#endif BL_PROFILE_VAR_START(blp_partition); ParticleCopyOp op; int num_levels = finest_lev_particles + 1; op.setNumLevels(num_levels); - Vector > new_sizes(num_levels); + Vector, int> > new_sizes(num_levels); +#ifndef AMREX_USE_GPU + const int myproc = ParallelContext::MyProcSub(); +#endif +#ifdef AMREX_USE_GPU + auto assign_grid = m_particle_locator.getGridAssignor(); const auto plo = Geom(0).ProbLoArray(); const auto phi = Geom(0).ProbHiArray(); const auto rlo = Geom(0).ProbLoArrayInParticleReal(); const auto rhi = Geom(0).ProbHiArrayInParticleReal(); const auto is_per = Geom(0).isPeriodicArray(); - for (int lev = lev_min; lev <= finest_lev_particles; ++lev) - { - auto& plev = m_particles[lev]; - for (auto& kv : plev) - { - int gid = kv.first.first; - int tid = kv.first.second; - auto index = std::make_pair(gid, tid); +#endif - auto& src_tile = plev[index]; - const size_t np = src_tile.numParticles(); +#if defined(AMREX_USE_OMP) || defined(AMREX_USE_GPU) + Vector > grid_tile_ids; + Vector ptile_ptrs; + Vector plevs; + Vector new_size_ptrs; +#ifndef AMREX_USE_GPU + Vector*> box_ptrs; + Vector*> level_ptrs; + Vector*> tile_ptrs; + Vector*> src_index_ptrs; + Vector*> periodic_shift_ptrs; +#endif + std::size_t num_tiles = 0; + for (int lev = lev_min; lev <= finest_lev_particles; ++lev) { + for (auto const& kv : m_particles[lev]) { + if (kv.second.numParticles() != 0) { + ++num_tiles; + } + } + } + grid_tile_ids.reserve(num_tiles); + ptile_ptrs.reserve(num_tiles); + plevs.reserve(num_tiles); + new_size_ptrs.reserve(num_tiles); +#ifndef AMREX_USE_GPU + box_ptrs.reserve(num_tiles); + level_ptrs.reserve(num_tiles); + tile_ptrs.reserve(num_tiles); + src_index_ptrs.reserve(num_tiles); + periodic_shift_ptrs.reserve(num_tiles); +#endif + for (int lev = lev_min; lev <= finest_lev_particles; lev++) { + auto& pmap = m_particles[lev]; + for (auto& kv : pmap) + { + const auto np = kv.second.numParticles(); + if (np == 0) { continue; } - int num_stay = partitionParticlesByDest(src_tile, assign_grid, - std::forward(CellAssignor{}), - BufferMap(), - plo, phi, rlo, rhi, is_per, lev, gid, tid, - lev_min, lev_max, nGrow, remove_negative); + grid_tile_ids.push_back(kv.first); + ptile_ptrs.push_back(&(kv.second)); + plevs.push_back(lev); + auto index = std::make_pair(kv.first.first, kv.first.second); + auto& new_size = new_sizes[lev][index]; + new_size = 0; + new_size_ptrs.push_back(&new_size); +#ifndef AMREX_USE_GPU + op.resize(kv.first.first, kv.first.second, lev, static_cast(np)); + auto& boxes = op.m_boxes[lev][index]; + auto& levels = op.m_levels[lev][index]; + auto& tiles = op.m_tiles[lev][index]; + auto& src_indices = op.m_src_indices[lev][index]; + auto& periodic_shift = op.m_periodic_shift[lev][index]; + box_ptrs.push_back(&boxes); + level_ptrs.push_back(&levels); + tile_ptrs.push_back(&tiles); + src_index_ptrs.push_back(&src_indices); + periodic_shift_ptrs.push_back(&periodic_shift); +#endif + } + } +#endif - int num_move = np - num_stay; - new_sizes[lev][gid] = num_stay; - op.resize(gid, lev, num_move); +#if defined(AMREX_USE_OMP) || defined(AMREX_USE_GPU) +#ifdef AMREX_USE_OMP +#pragma omp parallel for +#endif + for (int pmap_it = 0; pmap_it < static_cast(ptile_ptrs.size()); ++pmap_it) + { + int lev = plevs[pmap_it]; + int gid = grid_tile_ids[pmap_it].first; + int tid = 
grid_tile_ids[pmap_it].second; + auto& src_tile = *ptile_ptrs[pmap_it]; + const size_t np = src_tile.numParticles(); + if (np == 0) {continue;} - auto p_boxes = op.m_boxes[lev][gid].dataPtr(); - auto p_levs = op.m_levels[lev][gid].dataPtr(); - auto p_src_indices = op.m_src_indices[lev][gid].dataPtr(); - auto p_periodic_shift = op.m_periodic_shift[lev][gid].dataPtr(); - auto ptd = src_tile.getParticleTileData(); +#ifndef AMREX_USE_GPU + auto& boxes = *box_ptrs[pmap_it]; + auto& levels = *level_ptrs[pmap_it]; + auto& tiles = *tile_ptrs[pmap_it]; + auto& src_indices = *src_index_ptrs[pmap_it]; + auto& periodic_shift = *periodic_shift_ptrs[pmap_it]; + *new_size_ptrs[pmap_it] = hostPartitionTile(src_tile, + lev, gid, tid, + lev_min, lev_max, nGrow, local, + remove_negative, myproc, + boxes, levels, tiles, + src_indices, periodic_shift); +#else // GPU algorithm + int num_stay = partitionParticlesByDest(src_tile, assign_grid, + std::forward(CellAssignor{}), + BufferMap(), + plo, phi, rlo, rhi, is_per, lev, gid, tid, + lev_min, lev_max, nGrow, remove_negative); + + int num_move = np - num_stay; + *new_size_ptrs[pmap_it] = num_stay; + op.resize(gid, tid, lev, num_move); + + auto index = std::make_pair(gid, tid); + auto* p_boxes = op.m_boxes[lev][index].dataPtr(); + auto* p_levs = op.m_levels[lev][index].dataPtr(); + auto* p_tiles = op.m_tiles[lev][index].dataPtr(); + auto* p_src_indices = op.m_src_indices[lev][index].dataPtr(); + auto* p_periodic_shift = op.m_periodic_shift[lev][index].dataPtr(); + auto ptd = src_tile.getParticleTileData(); + + amrex::ParallelFor(num_move, [=] AMREX_GPU_DEVICE (int i) + { + const auto p = ptd[i + num_stay]; - AMREX_FOR_1D ( num_move, i, + if (!p.id().is_valid()) + { + p_boxes[i] = -1; + p_tiles[i] = -1; + p_levs[i] = -1; + } + else { - const auto p = ptd[i + num_stay]; + const auto tup = assign_grid(p, lev_min, lev_max, nGrow, + std::forward(CellAssignor{})); + p_boxes[i] = amrex::get<0>(tup); + p_tiles[i] = amrex::get<1>(tup); + p_levs[i] = amrex::get<2>(tup); + } + p_periodic_shift[i] = IntVect(AMREX_D_DECL(0,0,0)); + p_src_indices[i] = i+num_stay; + }); +#endif + } +#else + for (int lev = lev_min; lev <= finest_lev_particles; ++lev) { + auto& pmap = m_particles[lev]; + for (auto& kv : pmap) + { + auto& src_tile = kv.second; + const auto np = src_tile.numParticles(); + if (np == 0) { continue; } - if (!p.id().is_valid()) - { - p_boxes[i] = -1; - p_levs[i] = -1; - } - else - { - const auto tup = assign_grid(p, lev_min, lev_max, nGrow, - std::forward(CellAssignor{})); - p_boxes[i] = amrex::get<0>(tup); - p_levs[i] = amrex::get<1>(tup); - } - p_periodic_shift[i] = IntVect(AMREX_D_DECL(0,0,0)); - p_src_indices[i] = i+num_stay; - }); + int gid = kv.first.first; + int tid = kv.first.second; + auto index = std::make_pair(gid, tid); + auto& new_size = new_sizes[lev][index]; + new_size = 0; + + op.resize(gid, tid, lev, static_cast(np)); + auto& boxes = op.m_boxes[lev][index]; + auto& levels = op.m_levels[lev][index]; + auto& tiles = op.m_tiles[lev][index]; + auto& src_indices = op.m_src_indices[lev][index]; + auto& periodic_shift = op.m_periodic_shift[lev][index]; + new_size = hostPartitionTile(src_tile, + lev, gid, tid, + lev_min, lev_max, nGrow, local, + remove_negative, myproc, + boxes, levels, tiles, + src_indices, periodic_shift); } } +#endif BL_PROFILE_VAR_STOP(blp_partition); ParticleCopyPlan plan; @@ -1490,7 +1681,7 @@ ParticleContainer_implDefineAndReturnParticleTile(lev, gid, tid); - int num_copies = plan.m_box_counts_h[this->BufferMap().gridAndLevToBucket(gid, 
lev)]; + int num_copies = plan.m_box_counts_h[this->BufferMap().gridAndTileAndLevToBucket(gid, tid, lev)]; if (num_copies > 0) { addsizes[&tile] += num_copies; } @@ -1576,12 +1764,11 @@ ParticleContainer_impl 0) { - AMREX_ALWAYS_ASSERT(do_tiling == false); for (int i = 0, N = int(plan.m_rcv_box_counts.size()); i < N; ++i) { int copy_size = plan.m_rcv_box_counts[i]; int lev = plan.m_rcv_box_levs[i]; int gid = plan.m_rcv_box_ids[i]; - int tid = 0; // It's always 0 because this function is for RedistributeGPU only and the tiling is off. + int tid = plan.m_rcv_box_tids[i]; auto& tile = this->DefineAndReturnParticleTile(lev, gid, tid); addsizes[&tile] += copy_size; } @@ -1590,669 +1777,6 @@ ParticleContainer_impl class Allocator, class CellAssignor> -void -ParticleContainer_impl -::RedistributeCPU (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) -{ - BL_PROFILE("ParticleContainer::RedistributeCPU()"); - - const int MyProc = ParallelContext::MyProcSub(); - auto strttime = amrex::second(); - - if (local > 0) { BuildRedistributeMask(0, local); } - - // On startup there are cases where Redistribute() could be called - // with a given finestLevel() where that AmrLevel has yet to be defined. - int theEffectiveFinestLevel = m_gdb->finestLevel(); - - while (!m_gdb->LevelDefined(theEffectiveFinestLevel)) { - theEffectiveFinestLevel--; - } - - if (int(m_particles.size()) < theEffectiveFinestLevel+1) { - if (Verbose()) { - amrex::Print() << "ParticleContainer::Redistribute() resizing containers from " - << m_particles.size() << " to " - << theEffectiveFinestLevel + 1 << '\n'; - } - m_particles.resize(theEffectiveFinestLevel+1); - m_dummy_mf.resize(theEffectiveFinestLevel+1); - } - - // It is important to do this even if we don't have more levels because we may have changed the - // grids at this level in a regrid. - for (int lev = 0; lev < theEffectiveFinestLevel+1; ++lev) { - RedefineDummyMF(lev); - } - - int finest_lev_particles; - if (lev_max == -1) { - lev_max = theEffectiveFinestLevel; - finest_lev_particles = m_particles.size() - 1; - } else { - finest_lev_particles = lev_max; - } - AMREX_ASSERT(lev_max <= finestLevel()); - - // This will hold the valid particles that go to another process - std::map > not_ours; - - int num_threads = OpenMP::get_max_threads(); - - // these are temporary buffers for each thread - std::map > > tmp_remote; - Vector, Vector > > ptile_local; - ptile_local.resize(theEffectiveFinestLevel+1); - - // we resize these buffers outside the parallel region - for (int lev = lev_min; lev <= lev_max; lev++) { - for (MFIter mfi(*m_dummy_mf[lev], this->do_tiling ? this->tile_size : IntVect::TheZeroVector()); - mfi.isValid(); ++mfi) { - auto index = std::make_pair(mfi.index(), mfi.LocalTileIndex()); - ptile_local[lev][index].resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - ptile_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int, - &m_soa_rdata_names, &m_soa_idata_names, arena()); - } - } - } - if (local) { - for (int i = 0; i < neighbor_procs.size(); ++i) { - tmp_remote[neighbor_procs[i]].resize(num_threads); - } - } else { - for (int i = 0; i < ParallelContext::NProcsSub(); ++i) { - tmp_remote[i].resize(num_threads); - } - } - - // first pass: for each tile in parallel, in each thread copies the particles that - // need to be moved into it's own, temporary buffer. 
-    for (int lev = lev_min; lev <= finest_lev_particles; lev++) {
-        auto& pmap = m_particles[lev];
-
-        Vector<std::pair<int, int> > grid_tile_ids;
-        Vector<ParticleTileType*> ptile_ptrs;
-        for (auto& kv : pmap)
-        {
-            grid_tile_ids.push_back(kv.first);
-            ptile_ptrs.push_back(&(kv.second));
-        }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-        for (int pmap_it = 0; pmap_it < static_cast<int>(ptile_ptrs.size()); ++pmap_it)
-        {
-            int thread_num = OpenMP::get_thread_num();
-            int grid = grid_tile_ids[pmap_it].first;
-            int tile = grid_tile_ids[pmap_it].second;
-
-            unsigned npart = ptile_ptrs[pmap_it]->numParticles();
-            ParticleLocData pld;
-
-            auto particle_tile = ptile_ptrs[pmap_it];
-            auto ptd = particle_tile->getParticleTileData();
-
-            if (npart != 0) {
-                Long last = npart - 1;
-                Long pindex = 0;
-                while (pindex <= last) {
-                    decltype(auto) p = ptd[pindex];
-
-                    if ((remove_negative == false) && (!ptd.id(pindex).is_valid())) {
-                        ++pindex;
-                        continue;
-                    }
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    locateParticle(p, pld, lev_min, lev_max, nGrow, local ? grid : -1);
-
-                    particlePostLocate(p, pld, lev);
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    const int who = ParallelContext::global_to_local_rank(ParticleDistributionMap(pld.m_lev)[pld.m_grid]);
-                    if (who == MyProc) {
-                        if (pld.m_lev != lev || pld.m_grid != grid || pld.m_tile != tile) {
-                            // We own it but must shift it to another place.
-                            auto index = std::make_pair(pld.m_grid, pld.m_tile);
-                            AMREX_ASSERT(ptile_local[pld.m_lev][index].size() == num_threads);
-
-                            auto& dst = ptile_local[pld.m_lev][index][thread_num];
-                            auto old_size = dst.size();
-                            auto new_size = old_size + 1;
-                            dst.resize(new_size, GrowthStrategy::Geometric);
-                            auto dst_ptd = dst.getParticleTileData();
-
-                            copyParticle(dst_ptd, ptd, pindex, old_size);
-
-                            ptd.id(pindex).make_invalid(); // Invalidate the particle
-                        }
-                    } else {
-                        auto& particles_to_send = tmp_remote[who][thread_num];
-                        auto old_size = particles_to_send.size();
-                        auto new_size = old_size + superparticle_size;
-                        particles_to_send.resize(new_size);
-
-                        ptd.packParticleData(particles_to_send.data(), pindex, old_size,
-                                             h_redistribute_real_comp.data(), h_redistribute_int_comp.data());
-
-                        ptd.id(pindex).make_invalid(); // Invalidate the particle
-                    }
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    ++pindex;
-                }
-
-                Long tot_npart = particle_tile->numTotalParticles();
-
-                if (last != npart - 1) {
-                    pindex = last + 1;
-                    last = npart;
-                    while (last < tot_npart) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        ++pindex;
-                        ++last;
-                    }
-                    particle_tile->resize(pindex);
-                }
-            }
-        }
-    }
-
-    for (int lev = lev_min; lev <= lev_max; lev++) {
-        particle_detail::clearEmptyEntries(m_particles[lev]);
-    }
-
-    // Second pass - for each tile in parallel, collect the particles we are owed from all threads' buffers.
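The first pass above also compacts each tile in place with a swap-with-last idiom: when a slot is invalidated, the element at `last` overwrites it and the same index is examined again. Reduced to a plain array, the idiom looks like this (a sketch; `keep` is a hypothetical stand-in for the particle validity test):

```cpp
// Sketch only: unstable in-place compaction by swapping with the last element.
#include <vector>

template <class T, class Pred>
std::size_t compact_in_place (std::vector<T>& v, Pred keep)
{
    std::size_t last = v.size();
    std::size_t i = 0;
    while (i < last) {
        if (keep(v[i])) { ++i; }              // element stays; move on
        else            { v[i] = v[--last]; } // overwrite; revisit slot i
    }
    v.resize(last);
    return last; // number of elements kept
}
```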
-    for (int lev = lev_min; lev <= lev_max; lev++) {
-
-        Vector<std::pair<int, int> > grid_tile_ids;
-
-        // we need to create any missing map entries in serial here
-        for (auto pmap_it = ptile_local[lev].begin(); pmap_it != ptile_local[lev].end(); pmap_it++)
-        {
-            DefineAndReturnParticleTile(lev, pmap_it->first.first, pmap_it->first.second);
-            grid_tile_ids.push_back(pmap_it->first);
-        }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-        for (int pit = 0; pit < static_cast<int>(grid_tile_ids.size()); ++pit) // NOLINT(modernize-loop-convert)
-        {
-            auto index = grid_tile_ids[pit];
-            auto& dst_ptile = ParticlesAt(lev, index.first, index.second);
-            auto& src_ptile = ptile_local[lev][index];
-
-            for (int i = 0; i < num_threads; ++i) {
-
-                Long to_copy = src_ptile[i].numParticles();
-                Long old_size = dst_ptile.numTotalParticles();
-                Long new_size = old_size + to_copy;
-                dst_ptile.resize(new_size);
-
-                auto dst_ptd = dst_ptile.getParticleTileData();
-                auto src_ptd = src_ptile[i].getParticleTileData();
-
-                for (Long j = 0; j < to_copy; ++j) {
-                    copyParticle(dst_ptd, src_ptd, j, j + old_size);
-                }
-
-                src_ptile[i].resize(0);
-                src_ptile[i].shrink_to_fit();
-            }
-        }
-    }
-
-    for (auto& map_it : tmp_remote) {
-        int who = map_it.first;
-        not_ours[who];
-    }
-
-    Vector<int> dest_proc_ids;
-    Vector<Vector<Vector<char> >* > pbuff_ptrs;
-    for (auto& kv : tmp_remote)
-    {
-        dest_proc_ids.push_back(kv.first);
-        pbuff_ptrs.push_back(&(kv.second));
-    }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-    for (int pmap_it = 0; pmap_it < static_cast<int>(pbuff_ptrs.size()); ++pmap_it)
-    {
-        int who = dest_proc_ids[pmap_it];
-        Vector<Vector<char> >& tmp = *(pbuff_ptrs[pmap_it]);
-        for (int i = 0; i < num_threads; ++i) {
-            not_ours[who].insert(not_ours[who].end(), tmp[i].begin(), tmp[i].end());
-            tmp[i].erase(tmp[i].begin(), tmp[i].end());
-        }
-    }
-
-    particle_detail::clearEmptyEntries(not_ours);
-
-    if (int(m_particles.size()) > theEffectiveFinestLevel+1) {
-        // Looks like we lost an AmrLevel on a regrid.
-        if (m_verbose > 0) {
-            amrex::Print() << "ParticleContainer::Redistribute() resizing m_particles from "
-                           << m_particles.size() << " to " << theEffectiveFinestLevel+1 << '\n';
-        }
-        AMREX_ASSERT(int(m_particles.size()) >= 2);
-
-        m_particles.resize(theEffectiveFinestLevel + 1);
-        m_dummy_mf.resize(theEffectiveFinestLevel + 1);
-    }
-
-    if (ParallelContext::NProcsSub() == 1) {
-        AMREX_ASSERT(not_ours.empty());
-    }
-    else {
-        RedistributeMPI(not_ours, lev_min, lev_max, nGrow, local);
-    }
-
-    AMREX_ASSERT(OK(lev_min, lev_max, nGrow));
-
-    if (m_verbose > 0) {
-        auto stoptime = amrex::second() - strttime;
-
-        ByteSpread();
-
-#ifdef AMREX_LAZY
-        Lazy::QueueReduction( [=] () mutable {
-#endif
-        ParallelReduce::Max(stoptime, ParallelContext::IOProcessorNumberSub(),
-                            ParallelContext::CommunicatorSub());
-
-        amrex::Print() << "ParticleContainer::Redistribute() time: " << stoptime << "\n\n";
-#ifdef AMREX_LAZY
-        });
-#endif
-    }
-}
-
-template <typename ParticleType, int NArrayReal, int NArrayInt, template <class> class Allocator, class CellAssignor>
-void
-ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor>::
-RedistributeMPI (std::map<int, Vector<char> >& not_ours,
-                 int lev_min, int lev_max, int nGrow, int local)
-{
-    BL_PROFILE("ParticleContainer::RedistributeMPI()");
-    BL_PROFILE_VAR_NS("RedistributeMPI_locate", blp_locate);
-    BL_PROFILE_VAR_NS("RedistributeMPI_copy", blp_copy);
-
-#ifdef AMREX_USE_MPI
-
-    using buffer_type = unsigned long long;
-
-    std::map<int, Vector<buffer_type> > mpi_snd_data;
-    for (const auto& kv : not_ours)
-    {
-        auto nbt = (kv.second.size() + sizeof(buffer_type)-1)/sizeof(buffer_type);
-        mpi_snd_data[kv.first].resize(nbt);
-        std::memcpy((char*) mpi_snd_data[kv.first].data(), kv.second.data(), kv.second.size());
-    }
-
-    const int NProcs = ParallelContext::NProcsSub();
-    const int NNeighborProcs = neighbor_procs.size();
-
-    // We may now have particles that are rightfully owned by another CPU.
-    Vector<Long> Snds(NProcs, 0), Rcvs(NProcs, 0); // bytes!
-
-    Long NumSnds = 0;
-    if (local > 0)
-    {
-        AMREX_ALWAYS_ASSERT(lev_min == 0);
-        AMREX_ALWAYS_ASSERT(lev_max == 0);
-        BuildRedistributeMask(0, local);
-        NumSnds = doHandShakeLocal(not_ours, neighbor_procs, Snds, Rcvs);
-    }
-    else
-    {
-        NumSnds = doHandShake(not_ours, Snds, Rcvs);
-    }
-
-    const int SeqNum = ParallelDescriptor::SeqNum();
-
-    if ((! local) && NumSnds == 0) {
-        return; // There's no parallel work to do.
-    }
-
-    if (local)
-    {
-        Long tot_snds_this_proc = 0;
-        Long tot_rcvs_this_proc = 0;
-        for (int i = 0; i < NNeighborProcs; ++i) {
-            tot_snds_this_proc += Snds[neighbor_procs[i]];
-            tot_rcvs_this_proc += Rcvs[neighbor_procs[i]];
-        }
-        if ( (tot_snds_this_proc == 0) && (tot_rcvs_this_proc == 0) ) {
-            return; // There's no parallel work to do.
-        }
-    }
-
-    Vector<int> RcvProc;
-    Vector<std::size_t> rOffset; // Offset (in buffer_type units) in the receive buffer
-
-    std::size_t TotRcvInts = 0;
-    std::size_t TotRcvBytes = 0;
-    for (int i = 0; i < NProcs; ++i) {
-        if (Rcvs[i] > 0) {
-            RcvProc.push_back(i);
-            rOffset.push_back(TotRcvInts);
-            TotRcvBytes += Rcvs[i];
-            auto nbt = (Rcvs[i] + sizeof(buffer_type)-1)/sizeof(buffer_type);
-            TotRcvInts += nbt;
-        }
-    }
-
-    const auto nrcvs = static_cast<int>(RcvProc.size());
-    Vector<MPI_Status> stats(nrcvs);
-    Vector<MPI_Request> rreqs(nrcvs);
-
-    // Allocate data for rcvs as one big chunk.
-    Vector<buffer_type> recvdata(TotRcvInts);
-
-    // Post receives.
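One small point worth making explicit: the sizing logic above converts byte counts into whole `buffer_type` words with a ceiling division, both when packing `mpi_snd_data` and when accumulating `rOffset`/`TotRcvInts`. Stated on its own (C++17, illustrative only, not an AMReX function):

```cpp
// Sketch only: round a byte count up to whole buffer_type words.
#include <cstddef>

using buffer_type = unsigned long long;

constexpr std::size_t bytes_to_words (std::size_t nbytes)
{
    return (nbytes + sizeof(buffer_type) - 1) / sizeof(buffer_type);
}

static_assert(bytes_to_words(0) == 0);
static_assert(bytes_to_words(1) == 1);
static_assert(bytes_to_words(sizeof(buffer_type)) == 1);
static_assert(bytes_to_words(sizeof(buffer_type) + 1) == 2);
```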
-    for (int i = 0; i < nrcvs; ++i) {
-        const auto Who    = RcvProc[i];
-        const auto offset = rOffset[i];
-        const auto Cnt    = (Rcvs[Who] + sizeof(buffer_type)-1)/sizeof(buffer_type);
-        AMREX_ASSERT(Cnt > 0);
-        AMREX_ASSERT(Cnt < size_t(std::numeric_limits<int>::max()));
-        AMREX_ASSERT(Who >= 0 && Who < NProcs);
-
-        rreqs[i] = ParallelDescriptor::Arecv(&recvdata[offset], Cnt, Who, SeqNum,
-                                             ParallelContext::CommunicatorSub()).req();
-    }
-
-    // Send.
-    for (const auto& kv : mpi_snd_data) {
-        const auto Who = kv.first;
-        const auto Cnt = kv.second.size();
-
-        AMREX_ASSERT(Cnt > 0);
-        AMREX_ASSERT(Who >= 0 && Who < NProcs);
-        AMREX_ASSERT(Cnt < std::numeric_limits<int>::max());
-
-        ParallelDescriptor::Send(kv.second.data(), Cnt, Who, SeqNum,
-                                 ParallelContext::CommunicatorSub());
-    }
-
-    if (nrcvs > 0) {
-        ParallelDescriptor::Waitall(rreqs, stats);
-
-        BL_PROFILE_VAR_START(blp_locate);
-
-        int npart = TotRcvBytes / superparticle_size;
-
-        Vector<int> rcv_levs(npart);
-        Vector<int> rcv_grid(npart);
-        Vector<int> rcv_tile(npart);
-
-        int ipart = 0;
-        ParticleLocData pld;
-        for (int j = 0; j < nrcvs; ++j)
-        {
-            const auto offset = rOffset[j];
-            const auto Who    = RcvProc[j];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (int i = 0; i < int(Cnt); ++i)
-            {
-                char* pbuf = ((char*) &recvdata[offset]) + i*superparticle_size;
-
-                Particle<NStructReal, NStructInt> p;
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    std::memcpy(&p.m_idcpu, pbuf, sizeof(uint64_t));
-
-                    ParticleReal pos[AMREX_SPACEDIM];
-                    std::memcpy(&pos[0], pbuf + sizeof(uint64_t), AMREX_SPACEDIM*sizeof(ParticleReal));
-                    AMREX_D_TERM(p.pos(0) = pos[0];,
-                                 p.pos(1) = pos[1];,
-                                 p.pos(2) = pos[2]);
-                } else {
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                }
-
-                bool success = Where(p, pld, lev_min, lev_max, 0);
-                if (!success)
-                {
-                    success = (nGrow > 0) && Where(p, pld, lev_min, lev_min, nGrow);
-                    pld.m_grown_gridbox = pld.m_gridbox; // reset grown box for subsequent calls.
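An aside on the unpacking earlier in this loop: fields are extracted from the raw receive buffer with `std::memcpy` rather than by casting the buffer pointer, which avoids alignment and strict-aliasing problems. A minimal sketch with a hypothetical header layout (not the actual particle layout):

```cpp
// Sketch only: memcpy-based extraction of POD fields from a byte buffer.
#include <cstdint>
#include <cstring>

struct Header { std::uint64_t idcpu; double pos[3]; }; // hypothetical layout

Header read_header (const char* pbuf)
{
    Header h;
    std::memcpy(&h.idcpu, pbuf, sizeof(h.idcpu));
    std::memcpy(&h.pos[0], pbuf + sizeof(h.idcpu), sizeof(h.pos));
    return h;
}
```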
-                }
-                if (!success)
-                {
-                    amrex::Abort("RedistributeMPI_locate:: invalid particle.");
-                }
-
-                rcv_levs[ipart] = pld.m_lev;
-                rcv_grid[ipart] = pld.m_grid;
-                rcv_tile[ipart] = pld.m_tile;
-
-                ++ipart;
-            }
-        }
-
-        BL_PROFILE_VAR_STOP(blp_locate);
-
-        BL_PROFILE_VAR_START(blp_copy);
-
-#ifndef AMREX_USE_GPU
-        ipart = 0;
-        for (int i = 0; i < nrcvs; ++i)
-        {
-            const auto offset = rOffset[i];
-            const auto Who    = RcvProc[i];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (int j = 0; j < int(Cnt); ++j)
-            {
-                auto& ptile = m_particles[rcv_levs[ipart]][std::make_pair(rcv_grid[ipart],
-                                                                          rcv_tile[ipart])];
-                auto old_size = ptile.numTotalParticles();
-                auto new_size = old_size + 1;
-                ptile.resize(new_size, GrowthStrategy::Geometric);
-
-                char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    uint64_t idcpudata;
-                    std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
-                    pbuf += sizeof(uint64_t);
-                    ptile.GetStructOfArrays().GetIdCPUData()[old_size] = idcpudata;
-                } else {
-                    ParticleType p;
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                    pbuf += sizeof(ParticleType);
-                    ptile.GetArrayOfStructs()[old_size] = p;
-                }
-
-                int array_comp_start = 0;
-                if constexpr (!ParticleType::is_soa_particle) {
-                    array_comp_start = AMREX_SPACEDIM + NStructReal;
-                }
-                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                    if (h_redistribute_real_comp[array_comp_start + comp]) {
-                        ParticleReal rdata;
-                        std::memcpy(&rdata, pbuf, sizeof(ParticleReal));
-                        pbuf += sizeof(ParticleReal);
-                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = rdata;
-                    } else {
-                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = 0.0;
-                    }
-                }
-
-                array_comp_start = 2 + NStructInt;
-                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                    if (h_redistribute_int_comp[array_comp_start + comp]) {
-                        int idata;
-                        std::memcpy(&idata, pbuf, sizeof(int));
-                        pbuf += sizeof(int);
-                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = idata;
-                    } else {
-                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = 0;
-                    }
-                }
-                ++ipart;
-            }
-        }
-
-#else
-        Vector<std::map<std::pair<int, int>, Gpu::HostVector<ParticleType> > > host_particles;
-        host_particles.reserve(15);
-        host_particles.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>,
-                        std::vector<Gpu::HostVector<ParticleReal> > > > host_real_attribs;
-        host_real_attribs.reserve(15);
-        host_real_attribs.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>,
-                        std::vector<Gpu::HostVector<int> > > > host_int_attribs;
-        host_int_attribs.reserve(15);
-        host_int_attribs.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
-        host_idcpu.reserve(15);
-        host_idcpu.resize(finestLevel()+1);
-
-        ipart = 0;
-        for (int i = 0; i < nrcvs; ++i)
-        {
-            const auto offset = rOffset[i];
-            const auto Who    = RcvProc[i];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (auto j = decltype(Cnt)(0); j < Cnt; ++j)
-            {
-                int lev = rcv_levs[ipart];
-                std::pair<int, int> ind(std::make_pair(rcv_grid[ipart], rcv_tile[ipart]));
-
-                char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
-
-                host_real_attribs[lev][ind].resize(NumRealComps());
-                host_int_attribs[lev][ind].resize(NumIntComps());
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    uint64_t idcpudata;
-                    std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
-                    pbuf += sizeof(uint64_t);
-                    host_idcpu[lev][ind].push_back(idcpudata);
-                } else {
-                    ParticleType p;
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                    pbuf += sizeof(ParticleType);
-                    host_particles[lev][ind].push_back(p);
-                }
-
-                host_real_attribs[lev][ind].resize(NumRealComps());
-                host_int_attribs[lev][ind].resize(NumIntComps());
-
-                // add the real...
-                int array_comp_start = 0;
-                if constexpr (!ParticleType::is_soa_particle) {
-                    array_comp_start = AMREX_SPACEDIM + NStructReal;
-                }
-                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                    if (h_redistribute_real_comp[array_comp_start + comp]) {
-                        ParticleReal rdata;
-                        std::memcpy(&rdata, pbuf, sizeof(ParticleReal));
-                        pbuf += sizeof(ParticleReal);
-                        host_real_attribs[lev][ind][comp].push_back(rdata);
-                    } else {
-                        host_real_attribs[lev][ind][comp].push_back(0.0);
-                    }
-                }
-
-                // ... and int array data
-                array_comp_start = 2 + NStructInt;
-                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                    if (h_redistribute_int_comp[array_comp_start + comp]) {
-                        int idata;
-                        std::memcpy(&idata, pbuf, sizeof(int));
-                        pbuf += sizeof(int);
-                        host_int_attribs[lev][ind][comp].push_back(idata);
-                    } else {
-                        host_int_attribs[lev][ind][comp].push_back(0);
-                    }
-                }
-                ++ipart;
-            }
-        }
-
-        for (int host_lev = 0; host_lev < static_cast<int>(host_particles.size()); ++host_lev)
-        {
-            for (auto& kv : host_particles[host_lev]) {
-                auto grid = kv.first.first;
-                auto tile = kv.first.second;
-                const auto& src_tile = kv.second;
-
-                auto& dst_tile = GetParticles(host_lev)[std::make_pair(grid,tile)];
-                auto old_size = dst_tile.size();
-                auto new_size = old_size + src_tile.size();
-                dst_tile.resize(new_size);
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
-                                   host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
-                                   dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
-                } else {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   src_tile.begin(), src_tile.end(),
-                                   dst_tile.GetArrayOfStructs().begin() + old_size);
-                }
-
-                for (int i = 0; i < NumRealComps(); ++i) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
-                                   host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(),
-                                   dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size);
-                }
-
-                for (int i = 0; i < NumIntComps(); ++i) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
-                                   host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(),
-                                   dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size);
-                }
-            }
-        }
-
-        Gpu::Device::streamSynchronize();
-#endif
-
-        BL_PROFILE_VAR_STOP(blp_copy);
-    }
-#else
-    amrex::ignore_unused(not_ours,lev_min,lev_max,nGrow,local);
-#endif
-}
-
 template <typename ParticleType, int NArrayReal, int NArrayInt, template <class> class Allocator, class CellAssignor>
 bool
diff --git a/Src/Particle/AMReX_ParticleLocator.H b/Src/Particle/AMReX_ParticleLocator.H
index aae714a868e..75fa01ae972 100644
--- a/Src/Particle/AMReX_ParticleLocator.H
+++ b/Src/Particle/AMReX_ParticleLocator.H
@@ -4,8 +4,11 @@
 #include
 #include
+#include
 #include
+#include
+
 namespace amrex {
@@ -22,17 +25,21 @@ struct AssignGrid
     Box m_domain;
     GpuArray<ParticleReal, AMREX_SPACEDIM> m_plo;
     GpuArray<ParticleReal, AMREX_SPACEDIM> m_dxi;
+    Dim3 m_tile_size;
+    bool m_do_tiling = false;

     AMREX_GPU_HOST_DEVICE
     AssignGrid () = default;

     AssignGrid (BinIteratorFactory a_bif,
                 const IntVect& a_bins_lo, const IntVect& a_bins_hi, const IntVect& a_bin_size,
-                const IntVect& a_num_bins, const Geometry& a_geom)
+                const IntVect& a_num_bins, const Geometry& a_geom,
+                bool a_do_tiling, const IntVect& a_tile_size)
         : m_bif(a_bif),
           m_lo(a_bins_lo.dim3()), m_hi(a_bins_hi.dim3()), m_bin_size(a_bin_size.dim3()),
           m_num_bins(a_num_bins.dim3()), m_domain(a_geom.Domain()),
-          m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray())
+          m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray()),
+          m_tile_size(a_tile_size.dim3()), m_do_tiling(a_do_tiling)
     {
         // clamp bin size and num_bins to 1 for AMREX_SPACEDIM < 3
         if (m_bin_size.x >= 0) {m_bin_size.x = amrex::max(m_bin_size.x, 1);}
@@ -44,19 +51,27 @@ struct AssignGrid
         if (m_bin_size.z >= 0) {m_num_bins.z = amrex::max(m_num_bins.z, 1);}
     }

+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    int getTile (const IntVect& iv, const Box& bx) const noexcept
+    {
+        Box tbx;
+        const IntVect tile_size(AMREX_D_DECL(m_tile_size.x, m_tile_size.y, m_tile_size.z));
+        return getTileIndex(iv, bx, m_do_tiling, tile_size, tbx);
+    }
+
     template <typename P, typename Assignor = DefaultAssignor>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const P& p, int nGrow=0, Assignor const& assignor = Assignor{}) const noexcept
+    std::pair<int, int> operator() (const P& p, int nGrow=0, Assignor const& assignor = Assignor{}) const noexcept
     {
         const auto iv = assignor(p, m_plo, m_dxi, m_domain);
         return this->operator()(iv, nGrow);
     }

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const IntVect& iv, int nGrow=0) const noexcept
+    std::pair<int, int> operator() (const IntVect& iv, int nGrow=0) const noexcept
     {
         if (AMREX_D_TERM((m_num_bins.x == 0), && (m_num_bins.y == 0), && (m_num_bins.z == 0))) {
-            return -1;
+            return {-1, -1};
         }
         const auto lo = iv.dim3();
         int ix_lo = amrex::max((lo.x - nGrow - m_lo.x) / m_bin_size.x - 1, 0);
@@ -66,7 +81,8 @@ struct AssignGrid
         int ix_hi = amrex::min((lo.x + nGrow - m_lo.x) / m_bin_size.x, m_num_bins.x-1);
         int iy_hi = amrex::min((lo.y + nGrow - m_lo.y) / m_bin_size.y, m_num_bins.y-1);
         int iz_hi = amrex::min((lo.z + nGrow - m_lo.z) / m_bin_size.z, m_num_bins.z-1);
-        int loc = -1;
+        int loc_gid = -1;
+        int loc_tid = -1;
         for (int ii = ix_lo; ii <= ix_hi; ++ii) {
             for (int jj = iy_lo; jj <= iy_hi; ++jj) {
                 for (int kk = iz_lo; kk <= iz_hi; ++kk) {
@@ -74,28 +90,31 @@ struct AssignGrid
                     for (const auto& nbor : m_bif.getBinIterator(index)) {
                         Box bx = nbor.second;
                         if (bx.contains(iv)) {
-                            return nbor.first;
+                            return {nbor.first, getTile(iv, bx)};
                         }
                         Box gbx = bx;
                         gbx.grow(nGrow);
                         if (gbx.contains(iv)) {
-                            if (loc < 0) {
-                                loc = nbor.first;
-                            }
-                            // Prefer particle not in corner ghost cells
-                            for (int dir = 0; dir < AMREX_SPACEDIM; ++dir) {
-                                Box gdbx = bx;
-                                gdbx.grow(dir, nGrow);
-                                if (gdbx.contains(iv)) {
-                                    loc = nbor.first;
-                                }
-                            }
+                            int tile = getTile(iv, bx);
+                            if (loc_gid < 0) {
+                                loc_gid = nbor.first;
+                                loc_tid = tile;
+                            }
+                            // Prefer particle not in corner ghost cells
+                            for (int dir = 0; dir < AMREX_SPACEDIM; ++dir) {
+                                Box gdbx = bx;
+                                gdbx.grow(dir, nGrow);
+                                if (gdbx.contains(iv)) {
+                                    loc_gid = nbor.first;
+                                    loc_tid = tile;
+                                }
+                            }
                         }
                     }
                 }
             }
         }
-        return loc;
+        return {loc_gid, loc_tid};
     }
 };
@@ -108,9 +127,13 @@ public:

     ParticleLocator () = default;

-    void build (const BoxArray& ba, const Geometry& geom)
+    void build (const BoxArray& ba, const Geometry& geom,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         m_defined = true;
+        m_do_tiling = a_do_tiling;
+        m_tile_size = a_tile_size;
         m_ba = ba;
         m_geom = geom;
         int num_boxes = static_cast<int>(ba.size());
@@ -184,21 +207,30 @@ public:
     {
         AMREX_ASSERT(m_defined);
         return AssignGrid(m_bins.getBinIteratorFactory(),
-                          m_bins_lo, m_bins_hi, m_bin_size, m_num_bins, m_geom);
+                          m_bins_lo, m_bins_hi, m_bin_size, m_num_bins, m_geom,
+                          m_do_tiling, m_tile_size);
     }

-    bool isValid (const BoxArray& ba) const noexcept
+    bool isValid (const BoxArray& ba,
+                  bool a_do_tiling = false,
+                  const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const noexcept
     {
-        if (m_defined) { return BoxArray::SameRefs(m_ba, ba); }
+        if (m_defined) {
+            return BoxArray::SameRefs(m_ba, ba) &&
+                   (m_do_tiling == a_do_tiling) &&
+                   (m_tile_size == a_tile_size);
+        }
         return false;
     }

 protected:

     bool m_defined{false};
+    bool m_do_tiling{false};

     BoxArray m_ba;
     Geometry m_geom;
+    IntVect m_tile_size{AMREX_D_DECL(1024000, 1024000, 1024000)};

     IntVect m_bins_lo;
     IntVect m_bins_hi;
@@ -223,25 +255,25 @@ struct AmrAssignGrid

     template <typename P, typename Assignor = DefaultAssignor>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuTuple<int, int> operator() (const P& p, int lev_min=-1, int lev_max=-1, int nGrow=0,
-                                   Assignor const& assignor = {}) const noexcept
+    GpuTuple<int, int, int> operator() (const P& p, int lev_min=-1, int lev_max=-1, int nGrow=0,
+                                        Assignor const& assignor = {}) const noexcept
     {
         lev_min = (lev_min == -1) ? 0 : lev_min;
         lev_max = (lev_max == -1) ? m_size - 1 : lev_max;

         for (int lev = lev_max; lev >= lev_min; --lev) {
-            int grid = m_funcs[lev](p, 0, assignor);
-            if (grid >= 0) { return makeTuple(grid, lev); }
+            auto grid_tile = m_funcs[lev](p, 0, assignor);
+            if (grid_tile.first >= 0) { return makeTuple(grid_tile.first, grid_tile.second, lev); }
         }

         {
             int lev = lev_min;
-            int grid = m_funcs[lev](p, nGrow, assignor);
-            if (grid >= 0) { return makeTuple(grid, lev); }
+            auto grid_tile = m_funcs[lev](p, nGrow, assignor);
+            if (grid_tile.first >= 0) { return makeTuple(grid_tile.first, grid_tile.second, lev); }
         }

-        return makeTuple(-1, -1);
+        return makeTuple(-1, -1, -1);
     }
 };
@@ -261,18 +293,24 @@ public:

     AmrParticleLocator() = default;

     AmrParticleLocator(const Vector<BoxArray>& a_ba,
-                       const Vector<Geometry>& a_geom)
+                       const Vector<Geometry>& a_geom,
+                       bool a_do_tiling = false,
+                       const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
-        build(a_ba, a_geom);
+        build(a_ba, a_geom, a_do_tiling, a_tile_size);
     }

-    AmrParticleLocator(const ParGDBBase* a_gdb)
+    AmrParticleLocator(const ParGDBBase* a_gdb,
+                       bool a_do_tiling = false,
+                       const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
-        build(a_gdb);
+        build(a_gdb, a_do_tiling, a_tile_size);
     }

     void build (const Vector<BoxArray>& a_ba,
-                const Vector<Geometry>& a_geom)
+                const Vector<Geometry>& a_geom,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         m_defined = true;
         int num_levels = static_cast<int>(a_ba.size());
@@ -282,7 +320,7 @@ public:
         Gpu::HostVector<AssignGrid<typename Bins::BinIteratorFactory> > h_grid_assignors(num_levels);
         for (int lev = 0; lev < num_levels; ++lev)
         {
-            m_locators[lev].build(a_ba[lev], a_geom[lev]);
+            m_locators[lev].build(a_ba[lev], a_geom[lev], a_do_tiling, a_tile_size);
             h_grid_assignors[lev] = m_locators[lev].getGridAssignor();
         }
         Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(),
@@ -291,13 +329,15 @@
 #else
         for (int lev = 0; lev < num_levels; ++lev)
         {
-            m_locators[lev].build(a_ba[lev], a_geom[lev]);
+            m_locators[lev].build(a_ba[lev], a_geom[lev], a_do_tiling, a_tile_size);
             m_grid_assignors[lev] = m_locators[lev].getGridAssignor();
         }
 #endif
     }

-    void build (const ParGDBBase* a_gdb)
+    void build (const ParGDBBase* a_gdb,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         Vector<BoxArray> ba;
         Vector<Geometry> geom;
@@ -307,29 +347,33 @@ public:
             ba.push_back(a_gdb->ParticleBoxArray(lev));
             geom.push_back(a_gdb->Geom(lev));
         }
-        build(ba, geom);
+        build(ba, geom, a_do_tiling, a_tile_size);
     }

-    [[nodiscard]] bool isValid (const Vector<BoxArray>& a_ba) const
+    [[nodiscard]] bool isValid (const Vector<BoxArray>& a_ba,
+                                bool a_do_tiling = false,
+                                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const
     {
         if ( !m_defined || (m_locators.empty()) ||
              (m_locators.size() != a_ba.size()) ) {
             return false;
         }
         bool all_valid = true;
         int num_levels = m_locators.size();
         for (int lev = 0; lev < num_levels; ++lev) {
-            all_valid = all_valid && m_locators[lev].isValid(a_ba[lev]);
+            all_valid = all_valid && m_locators[lev].isValid(a_ba[lev], a_do_tiling, a_tile_size);
         }
         return all_valid;
     }

-    bool isValid (const ParGDBBase* a_gdb) const
+    bool isValid (const ParGDBBase* a_gdb,
+                  bool a_do_tiling = false,
+                  const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const
     {
         Vector<BoxArray> ba;
         int num_levels = a_gdb->finestLevel()+1;
         for (int lev = 0; lev < num_levels; ++lev) {
             ba.push_back(a_gdb->ParticleBoxArray(lev));
         }
-        return this->isValid(ba);
+        return this->isValid(ba, a_do_tiling, a_tile_size);
     }

     void setGeometry (const ParGDBBase* a_gdb)
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index 9b8e055d031..c781dbd3266 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -550,7 +550,7 @@ partitionParticles (PTile& ptile, ParFunc const& is_left)
     // it compensates for the extra kernel launches and evaluations of is_left which this
     // algorithm needs.

-    ParallelFor(max_num_swaps,
+    ParallelForOMP(max_num_swaps,
     [=] AMREX_GPU_DEVICE (int i)
     {
         int left_i = p_index_left[i];
@@ -618,7 +618,7 @@ partitionParticles (PTile& ptile, int num_left, ParFunc const& is_left)
     }, Scan::Type::exclusive, Scan::noRetSum);

-    ParallelFor(max_num_swaps,
+    ParallelForOMP(max_num_swaps,
     [=] AMREX_GPU_DEVICE (int i)
     {
         int left_i = p_index_left[i];
@@ -643,8 +643,6 @@ removeInvalidParticles (PTile& ptile)
     ptile.resize(new_size);
 }

-#if defined(AMREX_USE_GPU)
-
 template <class PTile, class PLocator, class CellAssignor>
 int
 partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const& assignor,
@@ -654,7 +652,7 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
                           const GpuArray<ParticleReal, AMREX_SPACEDIM>& rlo,
                           const GpuArray<ParticleReal, AMREX_SPACEDIM>& rhi,
                           const GpuArray<int, AMREX_SPACEDIM>& is_per,
-                          int lev, int gid, int /*tid*/,
+                          int lev, int gid, int tid,
                           int lev_min, int lev_max, int nGrow, bool remove_negative)
 {
     auto getPID = pmap.getPIDFunctor();
@@ -666,15 +664,17 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const

     // the function for determining if a particle stays on this grid is very slow,
     // so we cache it in particle_stays to avoid evaluating it multiple times.
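The comment above names the key optimization: the expensive locate-and-compare test runs once per particle and its result is cached in a byte array, and the partition then consults only the cached flags, as the kernel right after this note does on the device. A serial sketch of the same cache-then-partition idea (hypothetical names, not the AMReX kernel):

```cpp
// Sketch only: evaluate an expensive predicate once per element, then
// partition on the cached flags so the predicate is never re-run.
#include <cstdint>
#include <utility>
#include <vector>

template <class T, class Pred>
int partition_with_cache (std::vector<T>& items, Pred expensive_stays)
{
    std::vector<std::uint8_t> stays(items.size());
    for (std::size_t i = 0; i < items.size(); ++i) {
        stays[i] = expensive_stays(items[i]) ? 1 : 0; // one evaluation per item
    }
    std::size_t num_stay = 0;
    for (std::size_t i = 0; i < items.size(); ++i) {
        if (stays[i]) {
            std::swap(items[num_stay], items[i]);
            std::swap(stays[num_stay], stays[i]);
            ++num_stay;
        }
    }
    return static_cast<int>(num_stay); // "staying" elements now lead the array
}
```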
-    ParallelFor(ptile.numParticles(),
+    ParallelForOMP(ptile.numParticles(),
     [=] AMREX_GPU_DEVICE (int i)
     {
         int assigned_grid;
+        int assigned_tile;
         int assigned_lev;

         if (!ptd.id(i).is_valid())
         {
             assigned_grid = -1;
+            assigned_tile = -1;
             assigned_lev = -1;
         }
         else
@@ -683,7 +683,8 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
             enforcePeriodic(p_prime, plo, phi, rlo, rhi, is_per);
             auto tup_prime = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
             assigned_grid = amrex::get<0>(tup_prime);
-            assigned_lev = amrex::get<1>(tup_prime);
+            assigned_tile = amrex::get<1>(tup_prime);
+            assigned_lev = amrex::get<2>(tup_prime);
             if (assigned_grid >= 0)
             {
                 AMREX_D_TERM(ptd.pos(0, i) = p_prime.pos(0);,
@@ -697,13 +698,15 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
                              p_prime.pos(2) = ptd.pos(2, i););
                 auto tup = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
                 assigned_grid = amrex::get<0>(tup);
-                assigned_lev = amrex::get<1>(tup);
+                assigned_tile = amrex::get<1>(tup);
+                assigned_lev = amrex::get<2>(tup);
             }
         }

         p_particle_stays[i] = uint8_t(
             ((remove_negative == false) && (!ptd.id(i).is_valid())) ||
-            ((assigned_grid == gid) && (assigned_lev == lev) && (getPID(lev, gid) == pid)));
+            ((assigned_grid == gid) && (assigned_tile == tid) &&
+             (assigned_lev == lev) && (getPID(lev, gid, tid) == pid)));
     });

     return partitionParticles(ptile,
@@ -712,8 +715,6 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
     });
 }

-#endif
-
 template <class PC1, class PC2>
 bool SameIteratorsOK (const PC1& pc1, const PC2& pc2) {
     if (pc1.numLevels() != pc2.numLevels()) {return false;}
diff --git a/Tests/Particles/Intersection/main.cpp b/Tests/Particles/Intersection/main.cpp
index 48ccb3c0bcd..a021107b0a7 100644
--- a/Tests/Particles/Intersection/main.cpp
+++ b/Tests/Particles/Intersection/main.cpp
@@ -92,7 +92,7 @@ void testIntersection()
     auto* const grids_ptr = device_grids.dataPtr();
     amrex::ParallelFor(num_cells, [=] AMREX_GPU_DEVICE (int j) noexcept
     {
-        grids_ptr[j] = assign_grid(cells_ptr[j]);
+        grids_ptr[j] = assign_grid(cells_ptr[j]).first;
     });

     ReduceOps reduce_op;
diff --git a/Tests/Particles/ParallelContext/main.cpp b/Tests/Particles/ParallelContext/main.cpp
index 4a55193eb27..b37fb752cac 100644
--- a/Tests/Particles/ParallelContext/main.cpp
+++ b/Tests/Particles/ParallelContext/main.cpp
@@ -73,6 +73,7 @@ class TestParticleContainer
         const int lev = 0;  // only add particles on level 0
         const Real* dx = Geom(lev).CellSize();
         const Real* plo = Geom(lev).ProbLo();
+        const IntVect dom_lo = Geom(lev).Domain().smallEnd();

         const int num_ppc = AMREX_D_TERM( a_num_particles_per_cell[0],
                                          *a_num_particles_per_cell[1],
                                          *a_num_particles_per_cell[2]);
@@ -98,12 +99,12 @@ class TestParticleContainer
                 ParticleType p;
                 p.id() = ParticleType::NextID();
                 p.cpu() = ParallelDescriptor::MyProc();
-                p.pos(0) = static_cast<ParticleReal> (plo[0] + (iv[0] + r[0])*dx[0]);
+                p.pos(0) = static_cast<ParticleReal> (plo[0] + ((iv[0] - dom_lo[0]) + r[0])*dx[0]);
 #if AMREX_SPACEDIM > 1
-                p.pos(1) = static_cast<ParticleReal> (plo[1] + (iv[1] + r[1])*dx[1]);
+                p.pos(1) = static_cast<ParticleReal> (plo[1] + ((iv[1] - dom_lo[1]) + r[1])*dx[1]);
 #endif
 #if AMREX_SPACEDIM > 2
-                p.pos(2) = static_cast<ParticleReal> (plo[2] + (iv[2] + r[2])*dx[2]);
+                p.pos(2) = static_cast<ParticleReal> (plo[2] + ((iv[2] - dom_lo[2]) + r[2])*dx[2]);
 #endif

                 for (int i = 0; i < NSR; ++i) { p.rdata(i) = ParticleReal(p.id()); }
diff --git a/Tests/Particles/Redistribute/CMakeLists.txt b/Tests/Particles/Redistribute/CMakeLists.txt
index cdae8d13d47..891631a3f24 100644
--- a/Tests/Particles/Redistribute/CMakeLists.txt
+++ b/Tests/Particles/Redistribute/CMakeLists.txt
@@ -8,6 +8,34 @@ foreach(D IN LISTS AMReX_SPACEDIM)

    setup_test(${D} _sources _input_files)

+   if (D EQUAL 3 AND NOT AMReX_GPU_BACKEND STREQUAL NONE)
+      set(_exe_dir ${CMAKE_CURRENT_BINARY_DIR}/${D}d)
+      set(_exe_name Test_Particles_Redistribute_${D}d)
+
+      file(COPY inputs.rt.cuda.tiling DESTINATION ${_exe_dir})
+
+      if (AMReX_OMP)
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+         set_tests_properties(Particles_Redistribute_${D}d_Tiling PROPERTIES ENVIRONMENT OMP_NUM_THREADS=2)
+      elseif (AMReX_MPI)
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND mpiexec -n 2 ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+      else ()
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+      endif ()
+   endif ()
+
    unset(_sources)
    unset(_input_files)

 endforeach()
diff --git a/Tests/Particles/Redistribute/inputs.rt.cuda.tiling b/Tests/Particles/Redistribute/inputs.rt.cuda.tiling
new file mode 100644
index 00000000000..62a114f03ac
--- /dev/null
+++ b/Tests/Particles/Redistribute/inputs.rt.cuda.tiling
@@ -0,0 +1,14 @@
+redistribute.size = (32, 64, 64)
+redistribute.max_grid_size = 32
+redistribute.is_periodic = 1
+redistribute.num_ppc = 1
+redistribute.move_dir = (1, 1, 1)
+redistribute.do_random = 1
+redistribute.nsteps = 100
+redistribute.nlevs = 1
+redistribute.do_regrid = 1
+
+redistribute.num_runtime_real = 2
+redistribute.num_runtime_int = 3
+
+particles.do_tiling=1
\ No newline at end of file
diff --git a/Tests/Particles/Redistribute/main.cpp b/Tests/Particles/Redistribute/main.cpp
index adc65598eba..a9aede8a161 100644
--- a/Tests/Particles/Redistribute/main.cpp
+++ b/Tests/Particles/Redistribute/main.cpp
@@ -90,6 +90,9 @@ class TestParticleContainer
                                          *a_num_particles_per_cell[1],
                                          *a_num_particles_per_cell[2]);

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for(MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi)
         {
             const Box& tile_box = mfi.tilebox();
@@ -189,6 +192,9 @@ class TestParticleContainer
     {
         BL_PROFILE("TestParticleContainer::moveParticles");

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             const auto dx = Geom(lev).CellSizeArray();
@@ -241,6 +247,9 @@ class TestParticleContainer
     {
         BL_PROFILE("TestParticleContainer::invalidateEven");

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             auto& plev = GetParticles(lev);
@@ -272,6 +281,9 @@ class TestParticleContainer
         int num_rr = NumRuntimeRealComps();
         int num_ii = NumRuntimeIntComps();

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             const auto& plev = GetParticles(lev);