diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H
index 64b6bec6729..ae649093636 100644
--- a/Src/Base/AMReX_GpuAtomic.H
+++ b/Src/Base/AMReX_GpuAtomic.H
@@ -18,8 +18,7 @@ namespace Gpu::Atomic {
 // For LogicalOr and LogicalAnd, the data type is int.
 // For Exch and CAS, the data type is generic.
 // All these functions are non-atomic in host code!!!
-// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
-// HostDevice::Atomic::Add is supported. We could certainly add more.
+// If one needs them to be atomic in host code, use HostDevice::Atomic::*.
 // If we add more types for atomicAdd, we also need to update HasAtomicAdd in AMReX_TypeTraits.H.

 /// \cond DOXYGEN_IGNORE
@@ -617,6 +616,21 @@ namespace HostDevice::Atomic {
         *sum += value;
     }

+    template <class T>
+    AMREX_FORCE_INLINE
+    T FetchAdd_Host (T* const sum, T const value) noexcept
+    {
+        T old;
+#ifdef AMREX_USE_OMP
+#pragma omp atomic capture
+#endif
+        {
+            old = *sum;
+            *sum += value;
+        }
+        return old;
+    }
+
     template <class T>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Add (T* const sum, T const value) noexcept
@@ -625,6 +639,14 @@ namespace HostDevice::Atomic {
         AMREX_IF_ON_HOST((Add_Host(sum,value);))
     }

+    template <class T>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T FetchAdd (T* const sum, T const value) noexcept
+    {
+        AMREX_IF_ON_DEVICE((return Gpu::Atomic::Add(sum,value);))
+        AMREX_IF_ON_HOST((return FetchAdd_Host(sum,value);))
+    }
+
 }

 #ifdef AMREX_USE_GPU
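Editorial note on the hunk above: HostDevice::Atomic::FetchAdd returns the value *sum held before the addition. Gpu::Atomic::Add already has fetch-add semantics on the device; the new host path gets the same behavior from "omp atomic capture" (and is non-atomic without OpenMP). A minimal usage sketch, not part of the patch, with a made-up counter:

    // Each caller claims a unique, consecutive slot in 'counter'.
    unsigned int counter = 0; // hypothetical shared counter
    unsigned int slot = amrex::HostDevice::Atomic::FetchAdd(&counter, 1u);
    // 'slot' is the pre-add value: 0 for the first caller, 1 for the second, ...
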
diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H
index 8e13f7a44d3..8d348c4f795 100644
--- a/Src/Particle/AMReX_NeighborParticles.H
+++ b/Src/Particle/AMReX_NeighborParticles.H
@@ -16,6 +16,7 @@ namespace amrex {
 struct NeighborCode
 {
     int grid_id;
+    Box grid_box;
     IntVect periodic_shift;
 };

diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
index 45b3b4ba228..d165ecc01e7 100644
--- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
+++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H
@@ -99,6 +99,7 @@ buildNeighborMask ()
                 {
                     NeighborCode code;
                     code.grid_id = nbor_grid.grid_id;
+                    code.grid_box = ba[nbor_grid.grid_id];
                     code.periodic_shift = nbor_grid.periodic_shift;
                     h_code_arr.push_back(code);
                     h_isec_boxes.push_back(nbor_grid.box);
@@ -170,6 +171,8 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
         auto p_code_array = m_code_array[gid].dataPtr();
         auto p_isec_boxes = m_isec_boxes[gid].dataPtr();
         const int nisec_box = m_isec_boxes[gid].size();
+        const bool do_tiling = this->do_tiling;
+        const IntVect tile_size = this->tile_size;
         // auto p_code_offsets = m_code_offsets[gid].dataPtr();

         AMREX_FOR_1D ( np, i,
@@ -194,12 +197,14 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
         Gpu::dtoh_memcpy_async(&num_copies, offsets.data()+np, sizeof(int));
         Gpu::streamSynchronize();

-        neighbor_copy_op.resize(gid, lev, num_copies);
+        neighbor_copy_op.resize(gid, tid, lev, num_copies);

-        auto p_boxes = neighbor_copy_op.m_boxes[lev][gid].dataPtr();
-        auto p_levs = neighbor_copy_op.m_levels[lev][gid].dataPtr();
-        auto p_src_indices = neighbor_copy_op.m_src_indices[lev][gid].dataPtr();
-        auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][gid].dataPtr();
+        auto tile_index = std::make_pair(gid, tid);
+        auto p_boxes = neighbor_copy_op.m_boxes[lev][tile_index].dataPtr();
+        auto p_levs = neighbor_copy_op.m_levels[lev][tile_index].dataPtr();
+        auto p_tiles = neighbor_copy_op.m_tiles[lev][tile_index].dataPtr();
+        auto p_src_indices = neighbor_copy_op.m_src_indices[lev][tile_index].dataPtr();
+        auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][tile_index].dataPtr();
         Gpu::streamSynchronize();

         AMREX_FOR_1D ( np, i,
@@ -213,6 +218,9 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
             for (int j=0; j<nisec_box; ++j)
...
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(this->do_tiling == 0,
+        "Tiling on the GPU is not supported for neighbor particles.");
+
     buildNeighborMask();
     this->defineBufferMap();

diff --git a/Src/Particle/AMReX_ParticleBufferMap.H b/Src/Particle/AMReX_ParticleBufferMap.H
index 33cec4b8236..656fef35dc9 100644
--- a/Src/Particle/AMReX_ParticleBufferMap.H
+++ b/Src/Particle/AMReX_ParticleBufferMap.H
@@ -13,39 +13,45 @@ namespace amrex {

 struct GetPID
 {
     const int* m_bucket_to_pid;
-    const int* m_lev_gid_to_bucket;
+    const int* m_lev_gid_tid_to_bucket;
     const int* m_lev_offsets;
+    const int* m_gid_offsets;

     GetPID (const Gpu::DeviceVector<int>& bucket_to_pid,
-            const Gpu::DeviceVector<int>& lev_gid_to_bucket,
-            const Gpu::DeviceVector<int>& lev_offsets)
+            const Gpu::DeviceVector<int>& lev_gid_tid_to_bucket,
+            const Gpu::DeviceVector<int>& lev_offsets,
+            const Gpu::DeviceVector<int>& gid_offsets)
         : m_bucket_to_pid(bucket_to_pid.dataPtr()),
-          m_lev_gid_to_bucket(lev_gid_to_bucket.dataPtr()),
-          m_lev_offsets(lev_offsets.dataPtr())
+          m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket.dataPtr()),
+          m_lev_offsets(lev_offsets.dataPtr()),
+          m_gid_offsets(gid_offsets.dataPtr())
     {}

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const int lev, const int gid) const noexcept
+    int operator() (const int lev, const int gid, const int tid) const noexcept
     {
-        return m_bucket_to_pid[m_lev_gid_to_bucket[m_lev_offsets[lev]+gid]];
+        return m_bucket_to_pid[m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid]];
     }
 };

 struct GetBucket
 {
-    const int* m_lev_gid_to_bucket;
+    const int* m_lev_gid_tid_to_bucket;
     const int* m_lev_offsets;
-
-    GetBucket (const int* lev_gid_to_bucket_ptr,
-               const int* lev_offsets_ptr)
-        : m_lev_gid_to_bucket(lev_gid_to_bucket_ptr),
-          m_lev_offsets(lev_offsets_ptr)
+    const int* m_gid_offsets;
+
+    GetBucket (const int* lev_gid_tid_to_bucket_ptr,
+               const int* lev_offsets_ptr,
+               const int* gid_offsets_ptr)
+        : m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket_ptr),
+          m_lev_offsets(lev_offsets_ptr),
+          m_gid_offsets(gid_offsets_ptr)
     {}

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const int lev, const int gid) const noexcept
+    int operator() (const int lev, const int gid, const int tid) const noexcept
     {
-        return m_lev_gid_to_bucket[m_lev_offsets[lev]+gid];
+        return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid];
     }
 };

@@ -56,28 +62,40 @@ class ParticleBufferMap
     Vector<DistributionMapping> m_dm;

     Vector<int> m_bucket_to_gid;
+    Vector<int> m_bucket_to_tid;
     Vector<int> m_bucket_to_lev;
     Vector<int> m_bucket_to_pid;

-    Vector<int> m_lev_gid_to_bucket;
+    Vector<int> m_lev_gid_tid_to_bucket;
     Vector<int> m_lev_offsets;
+    Vector<int> m_gid_offsets;

     Vector<int> m_proc_box_counts;
     Vector<int> m_proc_box_offsets;

+    bool m_do_tiling{false};
+    IntVect m_tile_size{AMREX_D_DECL(1024000, 1024000, 1024000)};
+
     Gpu::DeviceVector<int> d_bucket_to_pid;
-    Gpu::DeviceVector<int> d_lev_gid_to_bucket;
+    Gpu::DeviceVector<int> d_lev_gid_tid_to_bucket;
     Gpu::DeviceVector<int> d_lev_offsets;
+    Gpu::DeviceVector<int> d_gid_offsets;

 public:

     ParticleBufferMap () = default;

     ParticleBufferMap (const ParGDBBase* a_gdb);

+    ParticleBufferMap (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);
+
     void define (const ParGDBBase* a_gdb);

+    void define (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);
+
     bool isValid (const ParGDBBase* a_gdb) const;

+    bool isValid (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size) const;
+
     [[nodiscard]] AMREX_FORCE_INLINE
     int numLevels () const
     {
@@ -99,6 +117,13 @@ public:
         return m_bucket_to_gid[bid];
     }

+    [[nodiscard]] AMREX_FORCE_INLINE
+    int bucketToTile (int bid) const
+    {
+        AMREX_ASSERT(m_defined);
+        return m_bucket_to_tid[bid];
+    }
+
     [[nodiscard]] AMREX_FORCE_INLINE
     int bucketToLevel (int bid) const
     {
@@ -115,9 +140,16 @@ public:

     [[nodiscard]] AMREX_FORCE_INLINE
     int gridAndLevToBucket (int gid, int lev) const
+    {
+        AMREX_ASSERT(!m_do_tiling);
+        return gridAndTileAndLevToBucket(gid, 0, lev);
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    int gridAndTileAndLevToBucket (int gid, int tid, int lev) const
     {
         AMREX_ASSERT(m_defined);
-        return m_lev_gid_to_bucket[m_lev_offsets[lev] + gid];
+        return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev] + gid] + tid];
     }

     [[nodiscard]] AMREX_FORCE_INLINE
@@ -148,14 +180,21 @@ public:

     [[nodiscard]] AMREX_FORCE_INLINE
     int procID (int gid, int lev) const
+    {
+        AMREX_ASSERT(!m_do_tiling);
+        return procID(gid, 0, lev);
+    }
+
+    [[nodiscard]] AMREX_FORCE_INLINE
+    int procID (int gid, int tid, int lev) const
     {
         AMREX_ASSERT(m_defined);
-        return m_dm[lev][gid];
+        return m_bucket_to_pid[gridAndTileAndLevToBucket(gid, tid, lev)];
     }

-    [[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_to_bucket, d_lev_offsets);}
-    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_to_bucket.data(), d_lev_offsets.data());}
-    [[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_to_bucket.data(), m_lev_offsets.data());}
+    [[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_tid_to_bucket, d_lev_offsets, d_gid_offsets);}
+    [[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_tid_to_bucket.data(), d_lev_offsets.data(), d_gid_offsets.data());}
+    [[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_tid_to_bucket.data(), m_lev_offsets.data(), m_gid_offsets.data());}
 };

diff --git a/Src/Particle/AMReX_ParticleBufferMap.cpp b/Src/Particle/AMReX_ParticleBufferMap.cpp
index 95fd1a9b69c..d9b3d72a484 100644
--- a/Src/Particle/AMReX_ParticleBufferMap.cpp
+++ b/Src/Particle/AMReX_ParticleBufferMap.cpp
@@ -1,17 +1,32 @@
 #include <AMReX_ParticleBufferMap.H>
+#include <AMReX_ParticleUtil.H>

 namespace amrex {

 ParticleBufferMap::ParticleBufferMap (const ParGDBBase* a_gdb)
 {
-    define(a_gdb);
+    define(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+ParticleBufferMap::ParticleBufferMap (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                      const IntVect& a_tile_size)
+{
+    define(a_gdb, a_do_tiling, a_tile_size);
 }

 void ParticleBufferMap::define (const ParGDBBase* a_gdb)
+{
+    define(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+void ParticleBufferMap::define (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                const IntVect& a_tile_size)
 {
     BL_PROFILE("ParticleBufferMap::define");
     m_defined = true;
+    m_do_tiling = a_do_tiling;
+    m_tile_size = a_tile_size;

     int num_levels = a_gdb->finestLevel()+1;

     m_ba.resize(0);
@@ -27,57 +42,75 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     m_lev_offsets.resize(0);
     m_lev_offsets.push_back(0);
     for (int lev = 0; lev < num_levels; ++lev) {
-        m_lev_offsets.push_back(m_lev_offsets.back() +static_cast<int>( m_ba[lev].size()));
+        m_lev_offsets.push_back(m_lev_offsets.back() + static_cast<int>(m_ba[lev].size()) + 1);
     }

-    int num_buckets = m_lev_offsets.back();
+    m_gid_offsets.resize(m_lev_offsets.back(), 0);
+
+    int num_buckets = 0;
+    for (int lev = 0; lev < num_levels; ++lev) {
+        int level_offset = m_lev_offsets[lev];
+        for (int gid = 0; gid < m_ba[lev].size(); ++gid) {
+            m_gid_offsets[level_offset + gid] = num_buckets;
+            num_buckets += numTilesInBox(m_ba[lev][gid], m_do_tiling, m_tile_size);
+        }
+        m_gid_offsets[level_offset + static_cast<int>(m_ba[lev].size())] = num_buckets;
+    }

     m_bucket_to_gid.resize(0);
     m_bucket_to_gid.resize(num_buckets);
+    m_bucket_to_tid.resize(0);
+    m_bucket_to_tid.resize(num_buckets);
     m_bucket_to_lev.resize(0);
     m_bucket_to_lev.resize(num_buckets);
     m_bucket_to_pid.resize(0);
     m_bucket_to_pid.resize(num_buckets);

-    m_lev_gid_to_bucket.resize(0);
-    m_lev_gid_to_bucket.resize(num_buckets);
+    m_lev_gid_tid_to_bucket.resize(0);
+    m_lev_gid_tid_to_bucket.resize(num_buckets);

-    using ThreeIntTuple = std::tuple<int, int, int>;
-    std::vector<ThreeIntTuple> box_lev_proc_ids;
+    using FourIntTuple = std::tuple<int, int, int, int>;
+    std::vector<FourIntTuple> box_tile_lev_proc_ids;
     for (int lev = 0; lev < num_levels; ++lev) {
         for (int i = 0; i < m_ba[lev].size(); ++i) {
             int rank = ParallelContext::global_to_local_rank(m_dm[lev][i]);
-            box_lev_proc_ids.emplace_back(i, lev, rank);
+            int num_tiles = numTilesInBox(m_ba[lev][i], m_do_tiling, m_tile_size);
+            for (int tid = 0; tid < num_tiles; ++tid) {
+                box_tile_lev_proc_ids.emplace_back(i, tid, lev, rank);
+            }
         }
     }

-    std::sort(box_lev_proc_ids.begin(), box_lev_proc_ids.end(),
-              [](const ThreeIntTuple& a, const ThreeIntTuple& b) -> bool
+    std::sort(box_tile_lev_proc_ids.begin(), box_tile_lev_proc_ids.end(),
+              [](const FourIntTuple& a, const FourIntTuple& b) -> bool
               {
-                  int pid_a = std::get<2>(a);
-                  int pid_b = std::get<2>(b);
+                  int pid_a = std::get<3>(a);
+                  int pid_b = std::get<3>(b);
                   if (pid_a != pid_b) { return pid_a < pid_b; }

-                  int lev_a = std::get<1>(a);
-                  int lev_b = std::get<1>(b);
+                  int lev_a = std::get<2>(a);
+                  int lev_b = std::get<2>(b);
                   if (lev_a != lev_b) { return lev_a < lev_b; }

                   int gid_a = std::get<0>(a);
                   int gid_b = std::get<0>(b);
                   if (gid_a != gid_b) { return gid_a < gid_b; }

+                  int tid_a = std::get<1>(a);
+                  int tid_b = std::get<1>(b);
+                  if (tid_a != tid_b) { return tid_a < tid_b; }
+
                   return false;
               });

     int bucket_index = 0;
-    for (int lev = 0; lev < num_levels; ++lev) {
-        for (int i = 0; i < m_ba[lev].size(); ++i) {
-            m_bucket_to_gid[bucket_index] = std::get<0>(box_lev_proc_ids[bucket_index]);
-            m_bucket_to_lev[bucket_index] = std::get<1>(box_lev_proc_ids[bucket_index]);
-            m_bucket_to_pid[bucket_index] = std::get<2>(box_lev_proc_ids[bucket_index]);
-            ++bucket_index;
-        }
+    for (int i = 0; i < num_buckets; ++i) {
+        m_bucket_to_gid[bucket_index] = std::get<0>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_tid[bucket_index] = std::get<1>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_lev[bucket_index] = std::get<2>(box_tile_lev_proc_ids[bucket_index]);
+        m_bucket_to_pid[bucket_index] = std::get<3>(box_tile_lev_proc_ids[bucket_index]);
+        ++bucket_index;
     }

     m_proc_box_counts.resize(0);
@@ -86,8 +119,10 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     for (int i = 0; i < num_buckets; ++i)
     {
         int lev = m_bucket_to_lev[i];
+        int gid = m_bucket_to_gid[i];
+        int tid = m_bucket_to_tid[i];
         int pid = m_bucket_to_pid[i];
-        m_lev_gid_to_bucket[m_lev_offsets[lev] + m_bucket_to_gid[i]] = i;
+        m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev] + gid] + tid] = i;
         m_proc_box_counts[pid]++;
     }

@@ -100,21 +135,33 @@ void ParticleBufferMap::define (const ParGDBBase* a_gdb)
     d_bucket_to_pid.resize(0);
     d_bucket_to_pid.resize(num_buckets);

-    d_lev_gid_to_bucket.resize(0);
-    d_lev_gid_to_bucket.resize(num_buckets);
+    d_lev_gid_tid_to_bucket.resize(0);
+    d_lev_gid_tid_to_bucket.resize(num_buckets);

     d_lev_offsets.resize(0);
     d_lev_offsets.resize(m_lev_offsets.size());

-    Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_to_bucket.begin(),m_lev_gid_to_bucket.end(),d_lev_gid_to_bucket.begin());
+    d_gid_offsets.resize(0);
+    d_gid_offsets.resize(m_gid_offsets.size());
+
+    Gpu::copyAsync(Gpu::hostToDevice, m_lev_gid_tid_to_bucket.begin(),m_lev_gid_tid_to_bucket.end(),d_lev_gid_tid_to_bucket.begin());
     Gpu::copyAsync(Gpu::hostToDevice, m_lev_offsets.begin(),m_lev_offsets.end(),d_lev_offsets.begin());
+    Gpu::copyAsync(Gpu::hostToDevice, m_gid_offsets.begin(),m_gid_offsets.end(),d_gid_offsets.begin());
     Gpu::copyAsync(Gpu::hostToDevice, m_bucket_to_pid.begin(),m_bucket_to_pid.end(),d_bucket_to_pid.begin());
     Gpu::streamSynchronize();
 }

 bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb) const
+{
+    return isValid(a_gdb, false, IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)));
+}
+
+bool ParticleBufferMap::isValid (const ParGDBBase* a_gdb, bool a_do_tiling,
+                                 const IntVect& a_tile_size) const
 {
     if (!m_defined) { return false; }

+    if (m_do_tiling != a_do_tiling) { return false; }
+    if (m_tile_size != a_tile_size) { return false; }
+
     int num_levs = a_gdb->finestLevel() + 1;
     if (num_levs != m_ba.size()) { return false; }
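Editorial note: with tiling, a bucket is addressed by (lev, gid, tid) through two offset tables. m_lev_offsets[lev] points at a level's run of per-grid entries (each level now carries one extra sentinel entry, hence the "+ 1" above), and m_gid_offsets holds the first bucket of every grid, so bucket = m_gid_offsets[m_lev_offsets[lev] + gid] + tid. A self-contained model of this indexing, with made-up tile counts:

    #include <cassert>
    #include <vector>

    int main () {
        // Level 0: two grids with 1 and 3 tiles; level 1: one grid with 2 tiles.
        std::vector<std::vector<int>> tiles_per_grid{{1, 3}, {2}};
        std::vector<int> lev_offsets{0};
        for (auto const& lev : tiles_per_grid) {
            lev_offsets.push_back(lev_offsets.back() + int(lev.size()) + 1); // +1 sentinel
        }
        std::vector<int> gid_offsets(lev_offsets.back(), 0);
        int num_buckets = 0;
        for (int lev = 0; lev < int(tiles_per_grid.size()); ++lev) {
            int off = lev_offsets[lev];
            for (int gid = 0; gid < int(tiles_per_grid[lev].size()); ++gid) {
                gid_offsets[off + gid] = num_buckets;           // first bucket of this grid
                num_buckets += tiles_per_grid[lev][gid];
            }
            gid_offsets[off + int(tiles_per_grid[lev].size())] = num_buckets; // sentinel
        }
        // bucket(lev, gid, tid) = gid_offsets[lev_offsets[lev] + gid] + tid
        assert(gid_offsets[lev_offsets[0] + 1] + 2 == 3); // level 0, grid 1, tile 2
        assert(num_buckets == 6);
        return 0;
    }
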
diff --git a/Src/Particle/AMReX_ParticleCommunication.H b/Src/Particle/AMReX_ParticleCommunication.H
index cc148c1ce5c..58496e389e6 100644
--- a/Src/Particle/AMReX_ParticleCommunication.H
+++ b/Src/Particle/AMReX_ParticleCommunication.H
@@ -7,12 +7,16 @@
 #include <AMReX_Gpu.H>
 #include <AMReX_GpuContainers.H>
 #include <AMReX_IntVect.H>
+#include <AMReX_OpenMP.H>
 #include <AMReX_ParGDB.H>
 #include <AMReX_ParticleBufferMap.H>
 #include <AMReX_TypeTraits.H>
 #include <AMReX_Scan.H>

 #include <map>
+#include <algorithm>
 #include <numeric>
+#include <tuple>
+#include <utility>

 namespace amrex {

@@ -59,21 +63,24 @@ struct RedistributeUnpackPolicy

 struct ParticleCopyOp
 {
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_boxes;
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_levels;
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_src_indices;
-    Vector<std::map<int, Gpu::DeviceVector<IntVect> > > m_periodic_shift;
+    using TileKey = std::pair<int, int>;
+
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_boxes;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_levels;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_tiles;
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_src_indices;
+    Vector<std::map<TileKey, Gpu::DeviceVector<IntVect> > > m_periodic_shift;

     void clear ();

     void setNumLevels (int num_levels);

-    void resize (int gid, int lev, int size);
+    void resize (int gid, int tid, int lev, int size);

-    [[nodiscard]] int numCopies (int gid, int lev) const
+    [[nodiscard]] int numCopies (TileKey const& index, int lev) const
     {
         if (m_boxes.size() <= lev) { return 0; }
-        auto mit = m_boxes[lev].find(gid);
+        auto mit = m_boxes[lev].find(index);
         return mit == m_boxes[lev].end() ? 0 : int(mit->second.size());
     }

@@ -82,7 +89,272 @@ struct ParticleCopyOp

 struct ParticleCopyPlan
 {
-    Vector<std::map<int, Gpu::DeviceVector<int> > > m_dst_indices;
+    using TileKey = std::pair<int, int>;
+
+    struct StableOrderedAlgorithm {};
+    struct TwoPassHostAlgorithm {};
+    struct AtomicScatterAlgorithm {};
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpCopyWork
+    {
+        const int* boxes = nullptr;
+        const int* levs = nullptr;
+        const int* tiles = nullptr;
+        int* dst_indices = nullptr;
+        int num_copies = 0;
+    };
+#endif
+
+    struct BuildWorkspace
+    {
+        explicit BuildWorkspace (int a_num_buckets)
+            : num_buckets(a_num_buckets)
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            , omp_copy_offsets(1, 0)
+#endif
+        {}
+
+        int num_buckets = 0;
+        Gpu::HostVector<unsigned int> h_box_counts;
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+        Vector<unsigned int> omp_thread_box_counts;
+        Vector<OmpCopyWork> omp_copy_work;
+        Vector<Long> omp_copy_offsets;
+#endif
+    };
+
+    template <class PC, class F>
+    void forEachCopyBatch (const PC& pc, const ParticleCopyOp& op, F&& f)
+    {
+        const int num_levels = op.numLevels();
+        m_dst_indices.resize(num_levels);
+        auto&& batch_fn = std::forward<F>(f);
+
+        for (int lev = 0; lev < num_levels; ++lev)
+        {
+            for (const auto& kv : pc.GetParticles(lev))
+            {
+                auto index = kv.first;
+                int num_copies = op.numCopies(index, lev);
+                if (num_copies == 0) { continue; }
+
+                auto& dst_indices = m_dst_indices[lev][index];
+                dst_indices.resize(num_copies);
+
+                batch_fn(lev, index, num_copies, dst_indices);
+            }
+        }
+    }
+
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      StableOrderedAlgorithm, BuildWorkspace& workspace, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesStableOrdered");
+
+        forEachCopyBatch(pc, op,
+            [&] (int lev, TileKey const& index, int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+#ifdef AMREX_USE_GPU
+            const Gpu::DeviceVector<int>& d_boxes = op.m_boxes[lev].at(index);
+            Gpu::HostVector<int> h_boxes(d_boxes.size());
+            Gpu::copy(Gpu::deviceToHost, d_boxes.begin(), d_boxes.end(), h_boxes.begin());
+
+            const Gpu::DeviceVector<int>& d_levs = op.m_levels[lev].at(index);
+            Gpu::HostVector<int> h_levs(d_levs.size());
+            Gpu::copy(Gpu::deviceToHost, d_levs.begin(), d_levs.end(), h_levs.begin());
+
+            const Gpu::DeviceVector<int>& d_tiles = op.m_tiles[lev].at(index);
+            Gpu::HostVector<int> h_tiles(d_tiles.size());
+            Gpu::copy(Gpu::deviceToHost, d_tiles.begin(), d_tiles.end(), h_tiles.begin());
+
+            Gpu::HostVector<int> h_dst_indices(num_copies);
+#else
+            const Gpu::DeviceVector<int>& h_boxes = op.m_boxes[lev].at(index);
+            const Gpu::DeviceVector<int>& h_levs = op.m_levels[lev].at(index);
+            const Gpu::DeviceVector<int>& h_tiles = op.m_tiles[lev].at(index);
+
+            Gpu::DeviceVector<int>& h_dst_indices = dst_indices;
+#endif
+            for (int i = 0; i < num_copies; ++i) {
+                int dst_box = h_boxes[i];
+                if (dst_box >= 0) {
+                    int dst_tile = h_tiles[i];
+                    int dst_lev = h_levs[i];
+                    int dst_index = static_cast<int>(workspace.h_box_counts[getBucket(dst_lev, dst_box, dst_tile)]++);
+                    h_dst_indices[i] = dst_index;
+                }
+            }
+
+#ifdef AMREX_USE_GPU
+            Gpu::copy(Gpu::hostToDevice,
+                      h_dst_indices.begin(), h_dst_indices.end(),
+                      dst_indices.begin());
+#endif
+        });
+    }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      TwoPassHostAlgorithm, BuildWorkspace& workspace, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesTwoPassHost");
+
+        workspace.omp_thread_box_counts.resize(OpenMP::get_max_threads()*workspace.num_buckets, 0U);
+
+        forEachCopyBatch(pc, op,
+            [&op, &workspace] (int lev, TileKey const& index, int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levs = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            auto* p_dst_indices = dst_indices.dataPtr();
+
+            workspace.omp_copy_work.push_back({p_boxes, p_levs, p_tiles, p_dst_indices, num_copies});
+            workspace.omp_copy_offsets.push_back(workspace.omp_copy_offsets.back() + num_copies);
+        });
+
+        if (workspace.omp_copy_work.empty()) { return; }
+
+        std::fill(workspace.omp_thread_box_counts.begin(), workspace.omp_thread_box_counts.end(), 0U);
+        auto* p_omp_thread_box_counts = workspace.omp_thread_box_counts.data();
+        const auto* p_omp_copy_work = workspace.omp_copy_work.data();
+        const auto* p_omp_copy_offsets = workspace.omp_copy_offsets.data();
+        const Long total_num_copies = workspace.omp_copy_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+            auto* p_thread_box_counts = p_omp_thread_box_counts + thread_num*workspace.num_buckets;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(workspace.omp_copy_offsets.begin(),
+                                                              workspace.omp_copy_offsets.end(),
+                                                              ibegin)
+                                             - workspace.omp_copy_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(workspace.omp_copy_work.size()) &&
+                       p_omp_copy_offsets[iwork] < iend)
+                {
+                    auto const& work = p_omp_copy_work[iwork];
+                    Long global_begin = std::max(ibegin, p_omp_copy_offsets[iwork]);
+                    Long global_end = std::min(iend, p_omp_copy_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - p_omp_copy_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - p_omp_copy_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levs[i];
+                            ++p_thread_box_counts[getBucket(dst_lev, dst_box, dst_tile)];
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+
+#pragma omp barrier
+#pragma omp for
+            for (int ibucket = 0; ibucket < workspace.num_buckets; ++ibucket)
+            {
+                unsigned int offset = workspace.h_box_counts[ibucket];
+                for (int tid = 0; tid < num_threads; ++tid)
+                {
+                    auto& count = p_omp_thread_box_counts[tid*workspace.num_buckets + ibucket];
+                    unsigned int total = count;
+                    count = offset;
+                    offset += total;
+                }
+                workspace.h_box_counts[ibucket] = offset;
+            }
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(workspace.omp_copy_offsets.begin(),
+                                                              workspace.omp_copy_offsets.end(),
+                                                              ibegin)
+                                             - workspace.omp_copy_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(workspace.omp_copy_work.size()) &&
+                       p_omp_copy_offsets[iwork] < iend)
+                {
+                    auto const& work = p_omp_copy_work[iwork];
+                    Long global_begin = std::max(ibegin, p_omp_copy_offsets[iwork]);
+                    Long global_end = std::min(iend, p_omp_copy_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - p_omp_copy_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - p_omp_copy_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levs[i];
+                            int bucket = getBucket(dst_lev, dst_box, dst_tile);
+                            work.dst_indices[i] = static_cast<int>(p_thread_box_counts[bucket]++);
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
+
+    template <class PC>
+    void buildCopies (const PC& pc, const ParticleCopyOp& op,
+                      AtomicScatterAlgorithm, BuildWorkspace&, GetBucket const& getBucket)
+    {
+        BL_PROFILE("ParticleCopyPlan::buildCopiesAtomicScatter");
+
+        auto* p_dst_box_counts = m_box_counts_d.dataPtr();
+
+        forEachCopyBatch(pc, op,
+            [&op, &getBucket, p_dst_box_counts] (int lev, TileKey const& index,
+                                                 int num_copies, Gpu::DeviceVector<int>& dst_indices)
+        {
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levs = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            auto* p_dst_indices = dst_indices.dataPtr();
+
+            amrex::ParallelFor(num_copies, [=] AMREX_GPU_DEVICE (int i)
+            {
+                int dst_box = p_boxes[i];
+                if (dst_box >= 0)
+                {
+                    int dst_tile = p_tiles[i];
+                    int dst_lev = p_levs[i];
+                    int dst_index = static_cast<int>(HostDevice::Atomic::FetchAdd(
+                        &p_dst_box_counts[getBucket(dst_lev, dst_box, dst_tile)], 1U));
+                    p_dst_indices[i] = dst_index;
+                }
+            });
+        });
+    }
+
+    void finalizeBuildBoxCounts (BuildWorkspace const& workspace, bool use_host_box_counters)
+    {
+        if (use_host_box_counters) {
+            Gpu::copy(Gpu::hostToDevice,
+                      workspace.h_box_counts.begin(), workspace.h_box_counts.end(),
+                      m_box_counts_d.begin());
+        }
+
+        amrex::Gpu::exclusive_scan(m_box_counts_d.begin(), m_box_counts_d.end(),
+                                   m_box_offsets.begin());
+    }
+
+public:
+
+    Vector<std::map<TileKey, Gpu::DeviceVector<int> > > m_dst_indices;

     Gpu::DeviceVector<unsigned int> m_box_counts_d;
     Gpu::HostVector<unsigned int> m_box_counts_h;
@@ -91,6 +363,7 @@ struct ParticleCopyPlan
     Vector<int> m_rcv_box_counts;
     Vector<int> m_rcv_box_offsets;
     Vector<int> m_rcv_box_ids;
+    Vector<int> m_rcv_box_tids;
     Vector<int> m_rcv_box_pids;
     Vector<int> m_rcv_box_levs;

@@ -135,22 +408,18 @@ struct ParticleCopyPlan
               const ParticleCopyOp& op,
               const Vector<int>& int_comp_mask,
               const Vector<int>& real_comp_mask,
-              bool local)
+              int local)
     {
         BL_PROFILE("ParticleCopyPlan::build");

-        m_local = local;
         ParmParse pp("particles");
         pp.query("do_one_sided_comms", m_do_one_sided_comms);
+        const int num_buckets = pc.BufferMap().numBuckets();

-        const int ngrow = 1; // note - fix
-
-        const int num_levels = op.numLevels();
-        const int num_buckets = pc.BufferMap().numBuckets();
-
-        if (m_local)
+        m_local = local;
+        if (local)
         {
-            m_neighbor_procs = pc.NeighborProcs(ngrow);
+            m_neighbor_procs = pc.NeighborProcs(local);
         }
         else
         {
@@ -161,69 +430,32 @@ struct ParticleCopyPlan
         m_box_counts_d.resize(0);
         m_box_counts_d.resize(num_buckets+1, 0);
         m_box_offsets.resize(num_buckets+1);
-        auto* p_dst_box_counts = m_box_counts_d.dataPtr();
-        auto getBucket = pc.stableRedistribute() ?
-            pc.BufferMap().getHostBucketFunctor() : pc.BufferMap().getBucketFunctor();

-        Gpu::HostVector<unsigned int> h_box_counts;
-        if (pc.stableRedistribute() ) {
-            h_box_counts.resize(m_box_counts_d.size(), 0);
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+        constexpr bool use_host_bucket_counters = true;
+#else
+        constexpr bool use_host_bucket_counters = false;
+#endif
+        BuildWorkspace workspace(num_buckets);
+        bool use_host_box_counters = pc.stableRedistribute() || use_host_bucket_counters;
+        if (use_host_box_counters) {
+            workspace.h_box_counts.resize(m_box_counts_d.size(), 0);
         }

-        m_dst_indices.resize(num_levels);
-        for (int lev = 0; lev < num_levels; ++lev)
+        if (pc.stableRedistribute())
         {
-            for (const auto& kv : pc.GetParticles(lev))
-            {
-                int gid = kv.first.first;
-                int num_copies = op.numCopies(gid, lev);
-                if (num_copies == 0) { continue; }
-                m_dst_indices[lev][gid].resize(num_copies);
-
-                if (pc.stableRedistribute()) {
-                    const Gpu::DeviceVector<int>& d_boxes = op.m_boxes[lev].at(gid);
-                    Gpu::HostVector<int> h_boxes(d_boxes.size());
-                    Gpu::copy(Gpu::deviceToHost,d_boxes.begin(),d_boxes.end(),h_boxes.begin());
-
-                    const Gpu::DeviceVector<int>& d_levs = op.m_levels[lev].at(gid);
-                    Gpu::HostVector<int> h_levs(d_levs.size());
-                    Gpu::copy(Gpu::deviceToHost,d_levs.begin(),d_levs.end(),h_levs.begin());
-
-                    Gpu::HostVector<int> h_dst_indices(num_copies);
-                    for (int i = 0; i < num_copies; ++i) {
-                        int dst_box = h_boxes[i];
-                        if (dst_box >= 0) {
-                            int dst_lev = h_levs[i];
-                            int index = static_cast<int>(h_box_counts[getBucket(dst_lev, dst_box)]++);
-                            h_dst_indices[i] = index;
-                        }
-                    }
-                    Gpu::copy(Gpu::hostToDevice,h_dst_indices.begin(),h_dst_indices.end(),m_dst_indices[lev][gid].begin());
-                }
-                else {
-                    const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
-                    const auto* p_levs = op.m_levels[lev].at(gid).dataPtr();
-                    auto* p_dst_indices = m_dst_indices[lev][gid].dataPtr();
-                    AMREX_FOR_1D ( num_copies, i,
-                    {
-                        int dst_box = p_boxes[i];
-                        if (dst_box >= 0)
-                        {
-                            int dst_lev = p_levs[i];
-                            int index = static_cast<int>(Gpu::Atomic::Add(
-                                &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1U));
-                            p_dst_indices[i] = index;
-                        }
-                    });
-                }
-            }
+            buildCopies(pc, op, StableOrderedAlgorithm{}, workspace, pc.BufferMap().getHostBucketFunctor());
         }
-
-        if (pc.stableRedistribute()) {
-            Gpu::copy(Gpu::hostToDevice,h_box_counts.begin(),h_box_counts.end(),m_box_counts_d.begin());
+        else
+        {
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            buildCopies(pc, op, TwoPassHostAlgorithm{}, workspace, pc.BufferMap().getBucketFunctor());
+#else
+            buildCopies(pc, op, AtomicScatterAlgorithm{}, workspace, pc.BufferMap().getBucketFunctor());
+#endif
         }

-        amrex::Gpu::exclusive_scan(m_box_counts_d.begin(), m_box_counts_d.end(),
-                                   m_box_offsets.begin());
+        finalizeBuildBoxCounts(workspace, use_host_box_counters);

         m_box_counts_h.resize(m_box_counts_d.size());
         Gpu::copyAsync(Gpu::deviceToHost, m_box_counts_d.begin(), m_box_counts_d.end(),
@@ -333,10 +565,10 @@ struct GetSendBufferOffset
     {}

     AMREX_FORCE_INLINE AMREX_GPU_DEVICE
-    Long operator() (int dst_box, int dst_lev, std::size_t psize, int i) const
+    Long operator() (int dst_box, int dst_tile, int dst_lev, std::size_t psize, int i) const
     {
-        int dst_pid = m_get_pid(dst_lev, dst_box);
-        Long dst_offset = Long(psize)*(m_box_offsets[m_get_bucket(dst_lev, dst_box)] + i);
+        int dst_pid = m_get_pid(dst_lev, dst_box, dst_tile);
+        Long dst_offset = Long(psize)*(m_box_offsets[m_get_bucket(dst_lev, dst_box, dst_tile)] + i);
         dst_offset += Long(m_pad_correction[dst_pid]);
         return dst_offset;
     }
@@ -381,36 +613,53 @@ void packBuffer (const PC& pc, const ParticleCopyOp& op, const ParticleCopyPlan&
     const auto plo = pc.Geom(0).ProbLoArray();
     const auto phi = pc.Geom(0).ProbHiArray();
     const auto is_per = pc.Geom(0).isPeriodicArray();
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpPackWork
+    {
+        typename PC::ParticleTileType const* src_tile = nullptr;
+        const int* boxes = nullptr;
+        const int* levels = nullptr;
+        const int* tiles = nullptr;
+        const int* src_indices = nullptr;
+        const IntVect* periodic_shift = nullptr;
+        const int* dst_indices = nullptr;
+        int num_copies = 0;
+    };
+    Vector<OmpPackWork> omp_pack_work;
+    Vector<Long> omp_pack_offsets(1, 0);
+#endif

     for (int lev = 0; lev < num_levels; ++lev)
     {
         auto& plev = pc.GetParticles(lev);
         for (auto& kv : plev)
         {
-            int gid = kv.first.first;
-            int tid = kv.first.second;
-            auto index = std::make_pair(gid, tid);
-
+            auto index = kv.first;
             auto& src_tile = plev.at(index);
-            const auto& ptd = src_tile.getConstParticleTileData();
-
-            int num_copies = op.numCopies(gid, lev);
+            int num_copies = op.numCopies(index, lev);
             if (num_copies == 0) { continue; }

-            const auto* p_boxes = op.m_boxes[lev].at(gid).dataPtr();
-            const auto* p_levels = op.m_levels[lev].at(gid).dataPtr();
-            const auto* p_src_indices = op.m_src_indices[lev].at(gid).dataPtr();
-            const auto* p_periodic_shift = op.m_periodic_shift[lev].at(gid).dataPtr();
-            const auto* p_dst_indices = plan.m_dst_indices[lev].at(gid).dataPtr();
+            const auto* p_boxes = op.m_boxes[lev].at(index).dataPtr();
+            const auto* p_levels = op.m_levels[lev].at(index).dataPtr();
+            const auto* p_tiles = op.m_tiles[lev].at(index).dataPtr();
+            const auto* p_src_indices = op.m_src_indices[lev].at(index).dataPtr();
+            const auto* p_periodic_shift = op.m_periodic_shift[lev].at(index).dataPtr();
+            const auto* p_dst_indices = plan.m_dst_indices[lev].at(index).dataPtr();

+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            omp_pack_work.push_back({&src_tile, p_boxes, p_levels, p_tiles,
+                                     p_src_indices, p_periodic_shift, p_dst_indices, num_copies});
+            omp_pack_offsets.push_back(omp_pack_offsets.back() + num_copies);
+#else
+            const auto& ptd = src_tile.getConstParticleTileData();
             auto* p_snd_buffer = snd_buffer.dataPtr();

             GetSendBufferOffset get_offset(plan, pc.BufferMap());
-
-            AMREX_FOR_1D ( num_copies, i,
+            amrex::ParallelFor(num_copies, [=] AMREX_GPU_DEVICE (int i)
             {
                 int dst_box = p_boxes[i];
                 if (dst_box >= 0)
                 {
+                    int dst_tile = p_tiles[i];
                     int dst_lev = p_levels[i];
-                    auto dst_offset = get_offset(dst_box, dst_lev, psize, p_dst_indices[i]);
+                    auto dst_offset = get_offset(dst_box, dst_tile, dst_lev, psize, p_dst_indices[i]);
                     int src_index = p_src_indices[i];
                     ptd.packParticleData(p_snd_buffer, src_index, dst_offset, p_comm_real, p_comm_int);
@@ -444,8 +693,87 @@ void packBuffer (const PC& pc, const ParticleCopyOp& op, const ParticleCopyPlan&
                     }
                 }
             });
+#endif
         }
     }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    if (!omp_pack_work.empty())
+    {
+        auto* p_snd_buffer = snd_buffer.dataPtr();
+        GetSendBufferOffset get_offset(plan, pc.BufferMap());
+        const Long total_num_copies = omp_pack_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(omp_pack_offsets.begin(),
+                                                              omp_pack_offsets.end(),
+                                                              ibegin)
+                                             - omp_pack_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(omp_pack_work.size()) &&
+                       omp_pack_offsets[iwork] < iend)
+                {
+                    auto const& work = omp_pack_work[iwork];
+                    auto const& ptd = work.src_tile->getConstParticleTileData();
+                    Long global_begin = std::max(ibegin, omp_pack_offsets[iwork]);
+                    Long global_end = std::min(iend, omp_pack_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - omp_pack_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - omp_pack_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        int dst_box = work.boxes[i];
+                        if (dst_box >= 0)
+                        {
+                            int dst_tile = work.tiles[i];
+                            int dst_lev = work.levels[i];
+                            auto dst_offset = get_offset(dst_box, dst_tile, dst_lev, psize,
+                                                         work.dst_indices[i]);
+                            int src_index = work.src_indices[i];
+                            ptd.packParticleData(p_snd_buffer, src_index, dst_offset,
+                                                 p_comm_real, p_comm_int);
+
+                            const IntVect& pshift = work.periodic_shift[i];
+                            bool do_periodic_shift =
+                                AMREX_D_TERM( (is_per[0] && pshift[0] != 0),
+                                           || (is_per[1] && pshift[1] != 0),
+                                           || (is_per[2] && pshift[2] != 0) );
+
+                            if (do_periodic_shift)
+                            {
+                                ParticleReal pos[AMREX_SPACEDIM];
+                                Long pos_offset = dst_offset;
+                                if constexpr (PC::ParticleType::is_soa_particle) {
+                                    pos_offset += sizeof(uint64_t);
+                                }
+                                amrex::Gpu::memcpy(&pos[0], &p_snd_buffer[pos_offset],
+                                                   AMREX_SPACEDIM*sizeof(ParticleReal));
+                                for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+                                {
+                                    if (! is_per[idim]) { continue; }
+                                    if (pshift[idim] > 0) {
+                                        pos[idim] += phi[idim] - plo[idim];
+                                    } else if (pshift[idim] < 0) {
+                                        pos[idim] -= phi[idim] - plo[idim];
+                                    }
+                                }
+                                amrex::Gpu::memcpy(&p_snd_buffer[pos_offset], &pos[0],
+                                                   AMREX_SPACEDIM*sizeof(ParticleReal));
+                            }
+                        }
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
 }

 template <class PC, class Buffer, class UnpackPolicy>
 void unpackBuffer (PC& pc, const ParticleCopyPlan& plan, const Buffer& snd_buffer, UnpackPolicy const& policy)
...
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    struct OmpUnpackWork
+    {
+        typename PC::ParticleTileType* tile = nullptr;
+        int gid = 0;
+        int tid = 0;
+        int lev = 0;
+        int offset = 0;
+        int size = 0;
+    };
+    Vector<OmpUnpackWork> omp_unpack_work;
+    Vector<Long> omp_unpack_offsets(1, 0);
+#endif
+
     // local unpack
     int uindex = 0;
     for (int lev = 0; lev < num_levels; ++lev)
@@ -496,21 +838,69 @@ void unpackBuffer (PC& pc, const ParticleCopyPlan& plan, const Buffer& snd_buffe
             auto& tile = plev[index];

             GetSendBufferOffset get_offset(plan, pc.BufferMap());
-            auto p_snd_buffer = snd_buffer.dataPtr();

             int offset = offsets[uindex];
             int size = sizes[uindex];
             ++uindex;

+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+            omp_unpack_work.push_back({&tile, gid, tid, lev, offset, size});
+            omp_unpack_offsets.push_back(omp_unpack_offsets.back() + size);
+#else
+            auto p_snd_buffer = snd_buffer.dataPtr();
             auto ptd = tile.getParticleTileData();
-            AMREX_FOR_1D ( size, i,
+            amrex::ParallelFor(size, [=] AMREX_GPU_DEVICE (int i)
             {
-                auto src_offset = get_offset(gid, lev, psize, i);
+                auto src_offset = get_offset(gid, tid, lev, psize, i);
                 int dst_index = offset + i;
                 ptd.unpackParticleData(p_snd_buffer, src_offset, dst_index,
                                        p_comm_real, p_comm_int);
             });
+#endif
         }
     }
+
+#if defined(AMREX_USE_OMP) && !defined(AMREX_USE_GPU)
+    if (!omp_unpack_work.empty())
+    {
+        GetSendBufferOffset get_offset(plan, pc.BufferMap());
+        auto p_snd_buffer = snd_buffer.dataPtr();
+        const Long total_num_copies = omp_unpack_offsets.back();
+
+#pragma omp parallel
+        {
+            int thread_num = OpenMP::get_thread_num();
+            int num_threads = OpenMP::get_num_threads();
+            Long ibegin = thread_num*total_num_copies/num_threads;
+            Long iend = (thread_num+1)*total_num_copies/num_threads;
+
+            if (ibegin < iend)
+            {
+                int iwork = static_cast<int>(std::upper_bound(omp_unpack_offsets.begin(),
+                                                              omp_unpack_offsets.end(),
+                                                              ibegin)
+                                             - omp_unpack_offsets.begin()) - 1;
+                while (iwork < static_cast<int>(omp_unpack_work.size()) &&
+                       omp_unpack_offsets[iwork] < iend)
+                {
+                    auto const& work = omp_unpack_work[iwork];
+                    auto ptd = work.tile->getParticleTileData();
+                    Long global_begin = std::max(ibegin, omp_unpack_offsets[iwork]);
+                    Long global_end = std::min(iend, omp_unpack_offsets[iwork+1]);
+                    int local_begin = static_cast<int>(global_begin - omp_unpack_offsets[iwork]);
+                    int local_end = static_cast<int>(global_end - omp_unpack_offsets[iwork]);
+                    for (int i = local_begin; i < local_end; ++i)
+                    {
+                        auto src_offset = get_offset(work.gid, work.tid, work.lev, psize, i);
+                        int dst_index = work.offset + i;
+                        ptd.unpackParticleData(p_snd_buffer, src_offset, dst_index,
+                                               p_comm_real, p_comm_int);
+                    }
+                    ++iwork;
+                }
+            }
+        }
+    }
+#endif
 }

 template <class PC, class Buffer, class UnpackPolicy>
 void unpackRemotes (PC& pc, const ParticleCopyPlan& plan, Buffer& rcv_buffer, UnpackPolicy const& policy)
...
+            auto src_offset = psize*static_cast<Long>(offset + ip)
+                + static_cast<Long>(p_pad_adjust[procindex]);
             int dst_index = dst_offset + ip;
             ptd.unpackParticleData(p_rcv_buffer, src_offset, dst_index,
                                    p_comm_real, p_comm_int);
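Editorial note: TwoPassHostAlgorithm above is the usual two-pass counting scatter. Pass one counts destination buckets into per-thread counters over a fixed work split, a serialized sweep turns those counts into per-thread starting offsets per bucket, and pass two revisits the exact same split and writes each item into its reserved slot. Unlike AtomicScatterAlgorithm, the result is deterministic and uses no atomics. A stripped-down sketch of the same pattern in plain OpenMP (names and data are hypothetical; schedule(static) over identical loops guarantees both passes see the same iteration-to-thread mapping):

    #include <cassert>
    #include <cstddef>
    #include <omp.h>
    #include <vector>

    // Assign each item a unique slot, grouped by bucket, without atomics.
    std::vector<int> stable_scatter (const std::vector<int>& bucket_of, int nbuckets)
    {
        int nthreads = omp_get_max_threads();
        std::vector<unsigned> counts(std::size_t(nthreads)*nbuckets, 0);
        std::vector<int> dst(bucket_of.size());
    #pragma omp parallel
        {
            int t = omp_get_thread_num();
    #pragma omp for schedule(static)
            for (int i = 0; i < int(bucket_of.size()); ++i) {   // pass 1: count
                ++counts[std::size_t(t)*nbuckets + bucket_of[i]];
            }
    #pragma omp single
            {                                                   // counts -> offsets
                unsigned off = 0;
                for (int b = 0; b < nbuckets; ++b) {
                    for (int tt = 0; tt < nthreads; ++tt) {
                        unsigned c = counts[std::size_t(tt)*nbuckets + b];
                        counts[std::size_t(tt)*nbuckets + b] = off;
                        off += c;
                    }
                }
            }
    #pragma omp for schedule(static)
            for (int i = 0; i < int(bucket_of.size()); ++i) {   // pass 2: scatter
                dst[i] = int(counts[std::size_t(t)*nbuckets + bucket_of[i]]++);
            }
        }
        return dst;
    }

    int main () {
        auto dst = stable_scatter({1, 0, 1, 0}, 2);
        assert(dst[1] == 0 && dst[3] == 1); // bucket 0 slots precede bucket 1 slots
        return 0;
    }
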
diff --git a/Src/Particle/AMReX_ParticleCommunication.cpp b/Src/Particle/AMReX_ParticleCommunication.cpp
index c6835dd86db..898c3f78612 100644
--- a/Src/Particle/AMReX_ParticleCommunication.cpp
+++ b/Src/Particle/AMReX_ParticleCommunication.cpp
@@ -8,6 +8,7 @@ void ParticleCopyOp::clear ()
 {
     m_boxes.resize(0);
     m_levels.resize(0);
+    m_tiles.resize(0);
     m_src_indices.resize(0);
     m_periodic_shift.resize(0);
 }
@@ -16,20 +17,23 @@ void ParticleCopyOp::setNumLevels (int num_levels)
 {
     m_boxes.resize(num_levels);
     m_levels.resize(num_levels);
+    m_tiles.resize(num_levels);
     m_src_indices.resize(num_levels);
     m_periodic_shift.resize(num_levels);
 }

-void ParticleCopyOp::resize (int gid, int lev, int size)
+void ParticleCopyOp::resize (int gid, int tid, int lev, int size)
 {
     if (lev >= m_boxes.size())
     {
         setNumLevels(lev+1);
     }
-    m_boxes[lev][gid].resize(size);
-    m_levels[lev][gid].resize(size);
-    m_src_indices[lev][gid].resize(size);
-    m_periodic_shift[lev][gid].resize(size);
+    const auto index = std::make_pair(gid, tid);
+    m_boxes[lev][index].resize(size);
+    m_levels[lev][index].resize(size);
+    m_tiles[lev][index].resize(size);
+    m_src_indices[lev][index].resize(size);
+    m_periodic_shift[lev][index].resize(size, IntVect(AMREX_D_DECL(0,0,0)));
 }

 void ParticleCopyPlan::clear ()
@@ -42,6 +46,7 @@ void ParticleCopyPlan::clear ()
     m_rcv_box_counts.clear();
     m_rcv_box_offsets.clear();
     m_rcv_box_ids.clear();
+    m_rcv_box_tids.clear();
     m_rcv_box_pids.clear();
     m_rcv_box_levs.clear();
 }
@@ -79,6 +84,7 @@ void ParticleCopyPlan::buildMPIStart (const ParticleContainerBase& pc, const Par
         for (auto bucket : box_buffer_indices)
         {
             int dst = map.bucketToGrid(bucket);
+            int tid = map.bucketToTile(bucket);
             int lev = map.bucketToLevel(bucket);
             AMREX_ASSERT(m_box_counts_h[bucket] <= static_cast<unsigned int>(std::numeric_limits<int>::max()));
             int npart = static_cast<int>(m_box_counts_h[bucket]);
@@ -87,9 +93,10 @@ void ParticleCopyPlan::buildMPIStart (const ParticleContainerBase& pc, const Par
             if (i == MyProc) { continue; }
             snd_data[i].push_back(npart);
             snd_data[i].push_back(dst);
+            snd_data[i].push_back(tid);
             snd_data[i].push_back(lev);
             snd_data[i].push_back(MyProc);
-            nbytes += 4*sizeof(int);
+            nbytes += 5*sizeof(int);
         }
         m_Snds[i] = nbytes;
         m_NumSnds += nbytes;
@@ -229,17 +236,19 @@ void ParticleCopyPlan::buildMPIFinish (const ParticleBufferMap& map) // NOLINT(r
     m_rcv_box_offsets.resize(0);
     m_rcv_box_counts.resize(0);
     m_rcv_box_ids.resize(0);
+    m_rcv_box_tids.resize(0);
     m_rcv_box_levs.resize(0);
     m_rcv_box_pids.resize(0);

     m_rcv_box_offsets.push_back(0);
-    for (int i = 0, N = static_cast<int>(m_rcv_data.size()); i < N; i+=4)
+    for (int i = 0, N = static_cast<int>(m_rcv_data.size()); i < N; i+=5)
     {
         m_rcv_box_counts.push_back(m_rcv_data[i]);
-        AMREX_ASSERT(ParallelContext::MyProcSub() == map.procID(m_rcv_data[i+1], m_rcv_data[i+2]));
+        AMREX_ASSERT(ParallelContext::MyProcSub() == map.procID(m_rcv_data[i+1], m_rcv_data[i+2], m_rcv_data[i+3]));
         m_rcv_box_ids.push_back(m_rcv_data[i+1]);
-        m_rcv_box_levs.push_back(m_rcv_data[i+2]);
-        m_rcv_box_pids.push_back(m_rcv_data[i+3]);
+        m_rcv_box_tids.push_back(m_rcv_data[i+2]);
+        m_rcv_box_levs.push_back(m_rcv_data[i+3]);
+        m_rcv_box_pids.push_back(m_rcv_data[i+4]);
         m_rcv_box_offsets.push_back(m_rcv_box_offsets.back() + m_rcv_box_counts.back());
     }
 }
@@ -251,7 +260,7 @@ void ParticleCopyPlan::buildMPIFinish (const ParticleBufferMap& map) // NOLINT(r
         const auto Cnt = m_Rcvs[Who]/sizeof(int);

         Long nparticles = 0;
-        for (auto i = offset; i < offset + Cnt; i +=4)
+        for (auto i = offset; i < offset + Cnt; i +=5)
         {
             nparticles += m_rcv_data[i];
         }

diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H
index e7e1feb8f5e..128eb1a7d16 100644
--- a/Src/Particle/AMReX_ParticleContainer.H
+++ b/Src/Particle/AMReX_ParticleContainer.H
@@ -1283,8 +1283,8 @@ public:
     void RedistributeCPU (int lev_min = 0, int lev_max = -1,
                           int nGrow = 0, int local=0, bool remove_negative=true);

-    void RedistributeGPU (int lev_min = 0, int lev_max = -1, int nGrow = 0, int local=0,
-                          bool remove_negative=true);
+    void Redistribute_impl (int lev_min = 0, int lev_max = -1, int nGrow = 0, int local=0,
+                            bool remove_negative=true);

     void ReserveForRedistribute (ParticleCopyPlan const& plan);
@@ -1546,6 +1546,17 @@ private:
     void RedistributeMPI (std::map<int, Vector<char> >& not_ours,
                           int lev_min = 0, int lev_max = 0, int nGrow = 0, int local=0);

+    AMREX_FORCE_INLINE
+    int hostPartitionTile (ParticleTileType& src_tile,
+                           int lev, int gid, int tid,
+                           int lev_min, int lev_max, int nGrow, int local,
+                           bool remove_negative, int myproc,
+                           Gpu::DeviceVector<int>& boxes,
+                           Gpu::DeviceVector<int>& levels,
+                           Gpu::DeviceVector<int>& tiles,
+                           Gpu::DeviceVector<int>& src_indices,
+                           Gpu::DeviceVector<IntVect>& periodic_shift);
+
     template <typename P>
     void locateParticle (P& p, ParticleLocData& pld,
                          int lev_min, int lev_max, int nGrow, int local_grid=-1) const;

diff --git a/Src/Particle/AMReX_ParticleContainerBase.cpp b/Src/Particle/AMReX_ParticleContainerBase.cpp
index 5a93871e012..410607cf212 100644
--- a/Src/Particle/AMReX_ParticleContainerBase.cpp
+++ b/Src/Particle/AMReX_ParticleContainerBase.cpp
@@ -74,9 +74,9 @@ ParticleContainerBase::defineBufferMap () const
 {
     BL_PROFILE("ParticleContainer::defineBufferMap");

-    if (! m_buffer_map.isValid(GetParGDB()))
+    if (! m_buffer_map.isValid(GetParGDB(), do_tiling, tile_size))
     {
-        m_buffer_map.define(GetParGDB());
+        m_buffer_map.define(GetParGDB(), do_tiling, tile_size);
     }
 }
m_buffer_map.isValid(GetParGDB(), do_tiling, tile_size)) { - m_buffer_map.define(GetParGDB()); + m_buffer_map.define(GetParGDB(), do_tiling, tile_size); } } diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index e916c4100b2..858400ee9dd 100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -812,7 +812,7 @@ struct FilterVirt int operator() (const SrcData& src, int src_i) const noexcept { auto iv = getParticleCell(src, src_i, m_plo, m_dxi, m_domain); - return (m_assign_buffer_grid(iv)!=-1); + return (m_assign_buffer_grid(iv).first != -1); } }; @@ -894,7 +894,7 @@ ParticleContainer_impl> locator; - locator.build(buffer, geom); + locator.build(buffer, geom, do_tiling, tile_size); AssignGrid> assign_buffer_grid = locator.getGridAssignor(); amrex::ParticleToMesh(*this, mf, level, @@ -904,7 +904,7 @@ ParticleContainer_impl const& dxi_loc) { auto iv = getParticleCell(p, plo_loc, dxi_loc, domain); - if(assign_buffer_grid(iv)==-1) + if(assign_buffer_grid(iv).first == -1) { //Ordering may make this not unique for (int i = 0; i < NArrayReal; ++i) @@ -938,7 +938,7 @@ ParticleContainer_impl(tup_min); const auto p_boxes_max = amrex::get<0>(tup_max); - const auto p_levs_max = amrex::get<1>(tup_max); + const auto p_levs_max = amrex::get<2>(tup_max); return p_boxes_max >=0 && p_boxes == m_gid && p_levs_max == m_lev_max; } }; @@ -1101,8 +1101,8 @@ ParticleContainer_impl class Allocator, class CellAssignor> +AMREX_FORCE_INLINE +int +ParticleContainer_impl +::hostPartitionTile (ParticleTileType& src_tile, + int lev, int gid, int tid, + int lev_min, int lev_max, int nGrow, int local, + bool remove_negative, int myproc, + Gpu::DeviceVector& boxes, + Gpu::DeviceVector& levels, + Gpu::DeviceVector& tiles, + Gpu::DeviceVector& src_indices, + Gpu::DeviceVector& periodic_shift) +{ + ParticleLocData pld; + auto ptd = src_tile.getParticleTileData(); + int num_move = 0; + + Long last = src_tile.numParticles() - 1; + Long pindex = 0; + int who = -1; + while (pindex <= last) { + decltype(auto) p = ptd[pindex]; + + // remove invalid if needed + if (!ptd.id(pindex).is_valid()) { + if (remove_negative) { + copyParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + --last; + } else { + ++pindex; + } + continue; + } + + // Fast path: if on the finest level, try the last good location first + // If not on finest level, we need to do the full locate because even if + // the last assignment works, the particle might belong on a finer level. + if (lev == lev_max && pld.m_tile == tid && pld.m_grid == gid && pld.m_lev == lev && who == myproc) { + const auto iv = Index(p, lev); + if (pld.m_tilebox.contains(iv)) { + ++pindex; + continue; // don't need correctCellVectors or particlePostLocate if it stays where it is + } + } + + // full locate + locateParticle(p, pld, lev_min, lev_max, nGrow, local ? gid : -1); + particlePostLocate(p, pld, lev); + + // particle may have been invalidated, so check if we need to remove again. 
+ if (!ptd.id(pindex).is_valid()) { + copyParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + --last; + continue; + } + + // flag particle for move if needed + who = BufferMap().procID(pld.m_grid, pld.m_tile, pld.m_lev); + if (pld.m_lev != lev || pld.m_grid != gid || pld.m_tile != tid || who != myproc) { + // the particle is valid but needs to be sent somewhere else + swapParticle(ptd, ptd, last, pindex); + correctCellVectors(last, pindex, gid, p); + boxes[num_move] = pld.m_grid; + levels[num_move] = pld.m_lev; + tiles[num_move] = pld.m_tile; + src_indices[num_move] = static_cast(last); + ++num_move; + --last; + continue; + } + + ++pindex; // if here, particle stays + } + + boxes.resize(num_move); + levels.resize(num_move); + tiles.resize(num_move); + src_indices.resize(num_move); + periodic_shift.resize(num_move); + + return static_cast(pindex); +} + // -// The GPU implementation of Redistribute +// Shared implementation of Redistribute // template class Allocator, class CellAssignor> void ParticleContainer_impl -::RedistributeGPU (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) +::Redistribute_impl (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) { -#ifdef AMREX_USE_GPU - if (local) { AMREX_ASSERT(numParticlesOutOfRange(*this, lev_min, lev_max, local) == 0); } - // sanity check - AMREX_ALWAYS_ASSERT(do_tiling == false); - - BL_PROFILE("ParticleContainer::RedistributeGPU()"); + BL_PROFILE("ParticleContainer::Redistribute_impl()"); BL_PROFILE_VAR_NS("Redistribute_partition", blp_partition); int theEffectiveFinestLevel = m_gdb->finestLevel(); @@ -1399,69 +1470,189 @@ ParticleContainer_impldefineBufferMap(); - if (! m_particle_locator.isValid(GetParGDB())) { m_particle_locator.build(GetParGDB()); } +#ifndef AMREX_USE_GPU + if (local > 0) { BuildRedistributeMask(0, local); } +#else + if (! 
m_particle_locator.isValid(GetParGDB(), do_tiling, tile_size)) { m_particle_locator.build(GetParGDB(), do_tiling, tile_size); } m_particle_locator.setGeometry(GetParGDB()); - auto assign_grid = m_particle_locator.getGridAssignor(); +#endif BL_PROFILE_VAR_START(blp_partition); ParticleCopyOp op; int num_levels = finest_lev_particles + 1; op.setNumLevels(num_levels); - Vector > new_sizes(num_levels); + Vector, int> > new_sizes(num_levels); +#ifndef AMREX_USE_GPU + const int myproc = ParallelContext::MyProcSub(); +#endif +#ifdef AMREX_USE_GPU + auto assign_grid = m_particle_locator.getGridAssignor(); const auto plo = Geom(0).ProbLoArray(); const auto phi = Geom(0).ProbHiArray(); const auto rlo = Geom(0).ProbLoArrayInParticleReal(); const auto rhi = Geom(0).ProbHiArrayInParticleReal(); const auto is_per = Geom(0).isPeriodicArray(); - for (int lev = lev_min; lev <= finest_lev_particles; ++lev) - { - auto& plev = m_particles[lev]; - for (auto& kv : plev) - { - int gid = kv.first.first; - int tid = kv.first.second; - auto index = std::make_pair(gid, tid); +#endif - auto& src_tile = plev[index]; - const size_t np = src_tile.numParticles(); +#if defined(AMREX_USE_OMP) || defined(AMREX_USE_GPU) + Vector > grid_tile_ids; + Vector ptile_ptrs; + Vector plevs; + Vector new_size_ptrs; +#ifndef AMREX_USE_GPU + Vector*> box_ptrs; + Vector*> level_ptrs; + Vector*> tile_ptrs; + Vector*> src_index_ptrs; + Vector*> periodic_shift_ptrs; +#endif + std::size_t num_tiles = 0; + for (int lev = lev_min; lev <= finest_lev_particles; ++lev) { + for (auto const& kv : m_particles[lev]) { + if (kv.second.numParticles() != 0) { + ++num_tiles; + } + } + } + grid_tile_ids.reserve(num_tiles); + ptile_ptrs.reserve(num_tiles); + plevs.reserve(num_tiles); + new_size_ptrs.reserve(num_tiles); +#ifndef AMREX_USE_GPU + box_ptrs.reserve(num_tiles); + level_ptrs.reserve(num_tiles); + tile_ptrs.reserve(num_tiles); + src_index_ptrs.reserve(num_tiles); + periodic_shift_ptrs.reserve(num_tiles); +#endif + for (int lev = lev_min; lev <= finest_lev_particles; lev++) { + auto& pmap = m_particles[lev]; + for (auto& kv : pmap) + { + const auto np = kv.second.numParticles(); + if (np == 0) { continue; } - int num_stay = partitionParticlesByDest(src_tile, assign_grid, - std::forward(CellAssignor{}), - BufferMap(), - plo, phi, rlo, rhi, is_per, lev, gid, tid, - lev_min, lev_max, nGrow, remove_negative); + grid_tile_ids.push_back(kv.first); + ptile_ptrs.push_back(&(kv.second)); + plevs.push_back(lev); + auto index = std::make_pair(kv.first.first, kv.first.second); + auto& new_size = new_sizes[lev][index]; + new_size = 0; + new_size_ptrs.push_back(&new_size); +#ifndef AMREX_USE_GPU + op.resize(kv.first.first, kv.first.second, lev, static_cast(np)); + auto& boxes = op.m_boxes[lev][index]; + auto& levels = op.m_levels[lev][index]; + auto& tiles = op.m_tiles[lev][index]; + auto& src_indices = op.m_src_indices[lev][index]; + auto& periodic_shift = op.m_periodic_shift[lev][index]; + box_ptrs.push_back(&boxes); + level_ptrs.push_back(&levels); + tile_ptrs.push_back(&tiles); + src_index_ptrs.push_back(&src_indices); + periodic_shift_ptrs.push_back(&periodic_shift); +#endif + } + } +#endif - int num_move = np - num_stay; - new_sizes[lev][gid] = num_stay; - op.resize(gid, lev, num_move); +#if defined(AMREX_USE_OMP) || defined(AMREX_USE_GPU) +#ifdef AMREX_USE_OMP +#pragma omp parallel for +#endif + for (int pmap_it = 0; pmap_it < static_cast(ptile_ptrs.size()); ++pmap_it) + { + int lev = plevs[pmap_it]; + int gid = grid_tile_ids[pmap_it].first; + int tid = 
grid_tile_ids[pmap_it].second; + auto& src_tile = *ptile_ptrs[pmap_it]; + const size_t np = src_tile.numParticles(); + if (np == 0) {continue;} - auto p_boxes = op.m_boxes[lev][gid].dataPtr(); - auto p_levs = op.m_levels[lev][gid].dataPtr(); - auto p_src_indices = op.m_src_indices[lev][gid].dataPtr(); - auto p_periodic_shift = op.m_periodic_shift[lev][gid].dataPtr(); - auto ptd = src_tile.getParticleTileData(); +#ifndef AMREX_USE_GPU + auto& boxes = *box_ptrs[pmap_it]; + auto& levels = *level_ptrs[pmap_it]; + auto& tiles = *tile_ptrs[pmap_it]; + auto& src_indices = *src_index_ptrs[pmap_it]; + auto& periodic_shift = *periodic_shift_ptrs[pmap_it]; + *new_size_ptrs[pmap_it] = hostPartitionTile(src_tile, + lev, gid, tid, + lev_min, lev_max, nGrow, local, + remove_negative, myproc, + boxes, levels, tiles, + src_indices, periodic_shift); +#else // GPU algorithm + int num_stay = partitionParticlesByDest(src_tile, assign_grid, + std::forward(CellAssignor{}), + BufferMap(), + plo, phi, rlo, rhi, is_per, lev, gid, tid, + lev_min, lev_max, nGrow, remove_negative); + + int num_move = np - num_stay; + *new_size_ptrs[pmap_it] = num_stay; + op.resize(gid, tid, lev, num_move); + + auto index = std::make_pair(gid, tid); + auto* p_boxes = op.m_boxes[lev][index].dataPtr(); + auto* p_levs = op.m_levels[lev][index].dataPtr(); + auto* p_tiles = op.m_tiles[lev][index].dataPtr(); + auto* p_src_indices = op.m_src_indices[lev][index].dataPtr(); + auto* p_periodic_shift = op.m_periodic_shift[lev][index].dataPtr(); + auto ptd = src_tile.getParticleTileData(); + + amrex::ParallelFor(num_move, [=] AMREX_GPU_DEVICE (int i) + { + const auto p = ptd[i + num_stay]; - AMREX_FOR_1D ( num_move, i, + if (!p.id().is_valid()) + { + p_boxes[i] = -1; + p_tiles[i] = -1; + p_levs[i] = -1; + } + else { - const auto p = ptd[i + num_stay]; + const auto tup = assign_grid(p, lev_min, lev_max, nGrow, + std::forward(CellAssignor{})); + p_boxes[i] = amrex::get<0>(tup); + p_tiles[i] = amrex::get<1>(tup); + p_levs[i] = amrex::get<2>(tup); + } + p_periodic_shift[i] = IntVect(AMREX_D_DECL(0,0,0)); + p_src_indices[i] = i+num_stay; + }); +#endif + } +#else + for (int lev = lev_min; lev <= finest_lev_particles; ++lev) { + auto& pmap = m_particles[lev]; + for (auto& kv : pmap) + { + auto& src_tile = kv.second; + const auto np = src_tile.numParticles(); + if (np == 0) { continue; } - if (!p.id().is_valid()) - { - p_boxes[i] = -1; - p_levs[i] = -1; - } - else - { - const auto tup = assign_grid(p, lev_min, lev_max, nGrow, - std::forward(CellAssignor{})); - p_boxes[i] = amrex::get<0>(tup); - p_levs[i] = amrex::get<1>(tup); - } - p_periodic_shift[i] = IntVect(AMREX_D_DECL(0,0,0)); - p_src_indices[i] = i+num_stay; - }); + int gid = kv.first.first; + int tid = kv.first.second; + auto index = std::make_pair(gid, tid); + auto& new_size = new_sizes[lev][index]; + new_size = 0; + + op.resize(gid, tid, lev, static_cast(np)); + auto& boxes = op.m_boxes[lev][index]; + auto& levels = op.m_levels[lev][index]; + auto& tiles = op.m_tiles[lev][index]; + auto& src_indices = op.m_src_indices[lev][index]; + auto& periodic_shift = op.m_periodic_shift[lev][index]; + new_size = hostPartitionTile(src_tile, + lev, gid, tid, + lev_min, lev_max, nGrow, local, + remove_negative, myproc, + boxes, levels, tiles, + src_indices, periodic_shift); } } +#endif BL_PROFILE_VAR_STOP(blp_partition); ParticleCopyPlan plan; @@ -1490,7 +1681,7 @@ ParticleContainer_implDefineAndReturnParticleTile(lev, gid, tid); - int num_copies = plan.m_box_counts_h[this->BufferMap().gridAndLevToBucket(gid, 
lev)]; + int num_copies = plan.m_box_counts_h[this->BufferMap().gridAndTileAndLevToBucket(gid, tid, lev)]; if (num_copies > 0) { addsizes[&tile] += num_copies; } @@ -1576,12 +1764,11 @@ ParticleContainer_impl 0) { - AMREX_ALWAYS_ASSERT(do_tiling == false); for (int i = 0, N = int(plan.m_rcv_box_counts.size()); i < N; ++i) { int copy_size = plan.m_rcv_box_counts[i]; int lev = plan.m_rcv_box_levs[i]; int gid = plan.m_rcv_box_ids[i]; - int tid = 0; // It's always 0 because this function is for RedistributeGPU only and the tiling is off. + int tid = plan.m_rcv_box_tids[i]; auto& tile = this->DefineAndReturnParticleTile(lev, gid, tid); addsizes[&tile] += copy_size; } @@ -1590,669 +1777,6 @@ ParticleContainer_impl class Allocator, class CellAssignor> -void -ParticleContainer_impl -::RedistributeCPU (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) -{ - BL_PROFILE("ParticleContainer::RedistributeCPU()"); - - const int MyProc = ParallelContext::MyProcSub(); - auto strttime = amrex::second(); - - if (local > 0) { BuildRedistributeMask(0, local); } - - // On startup there are cases where Redistribute() could be called - // with a given finestLevel() where that AmrLevel has yet to be defined. - int theEffectiveFinestLevel = m_gdb->finestLevel(); - - while (!m_gdb->LevelDefined(theEffectiveFinestLevel)) { - theEffectiveFinestLevel--; - } - - if (int(m_particles.size()) < theEffectiveFinestLevel+1) { - if (Verbose()) { - amrex::Print() << "ParticleContainer::Redistribute() resizing containers from " - << m_particles.size() << " to " - << theEffectiveFinestLevel + 1 << '\n'; - } - m_particles.resize(theEffectiveFinestLevel+1); - m_dummy_mf.resize(theEffectiveFinestLevel+1); - } - - // It is important to do this even if we don't have more levels because we may have changed the - // grids at this level in a regrid. - for (int lev = 0; lev < theEffectiveFinestLevel+1; ++lev) { - RedefineDummyMF(lev); - } - - int finest_lev_particles; - if (lev_max == -1) { - lev_max = theEffectiveFinestLevel; - finest_lev_particles = m_particles.size() - 1; - } else { - finest_lev_particles = lev_max; - } - AMREX_ASSERT(lev_max <= finestLevel()); - - // This will hold the valid particles that go to another process - std::map > not_ours; - - int num_threads = OpenMP::get_max_threads(); - - // these are temporary buffers for each thread - std::map > > tmp_remote; - Vector, Vector > > ptile_local; - ptile_local.resize(theEffectiveFinestLevel+1); - - // we resize these buffers outside the parallel region - for (int lev = lev_min; lev <= lev_max; lev++) { - for (MFIter mfi(*m_dummy_mf[lev], this->do_tiling ? this->tile_size : IntVect::TheZeroVector()); - mfi.isValid(); ++mfi) { - auto index = std::make_pair(mfi.index(), mfi.LocalTileIndex()); - ptile_local[lev][index].resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - ptile_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int, - &m_soa_rdata_names, &m_soa_idata_names, arena()); - } - } - } - if (local) { - for (int i = 0; i < neighbor_procs.size(); ++i) { - tmp_remote[neighbor_procs[i]].resize(num_threads); - } - } else { - for (int i = 0; i < ParallelContext::NProcsSub(); ++i) { - tmp_remote[i].resize(num_threads); - } - } - - // first pass: for each tile in parallel, in each thread copies the particles that - // need to be moved into it's own, temporary buffer. 
-    for (int lev = lev_min; lev <= finest_lev_particles; lev++) {
-        auto& pmap = m_particles[lev];
-
-        Vector<std::pair<int, int> > grid_tile_ids;
-        Vector<ParticleTileType*> ptile_ptrs;
-        for (auto& kv : pmap)
-        {
-            grid_tile_ids.push_back(kv.first);
-            ptile_ptrs.push_back(&(kv.second));
-        }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-        for (int pmap_it = 0; pmap_it < static_cast<int>(ptile_ptrs.size()); ++pmap_it)
-        {
-            int thread_num = OpenMP::get_thread_num();
-            int grid = grid_tile_ids[pmap_it].first;
-            int tile = grid_tile_ids[pmap_it].second;
-
-            unsigned npart = ptile_ptrs[pmap_it]->numParticles();
-            ParticleLocData pld;
-
-            auto particle_tile = ptile_ptrs[pmap_it];
-            auto ptd = particle_tile->getParticleTileData();
-
-            if (npart != 0) {
-                Long last = npart - 1;
-                Long pindex = 0;
-                while (pindex <= last) {
-                    decltype(auto) p = ptd[pindex];
-
-                    if ((remove_negative == false) && (!ptd.id(pindex).is_valid())) {
-                        ++pindex;
-                        continue;
-                    }
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    locateParticle(p, pld, lev_min, lev_max, nGrow, local ? grid : -1);
-
-                    particlePostLocate(p, pld, lev);
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    const int who = ParallelContext::global_to_local_rank(ParticleDistributionMap(pld.m_lev)[pld.m_grid]);
-                    if (who == MyProc) {
-                        if (pld.m_lev != lev || pld.m_grid != grid || pld.m_tile != tile) {
-                            // We own it but must shift it to another place.
-                            auto index = std::make_pair(pld.m_grid, pld.m_tile);
-                            AMREX_ASSERT(ptile_local[pld.m_lev][index].size() == num_threads);
-
-                            auto& dst = ptile_local[pld.m_lev][index][thread_num];
-                            auto old_size = dst.size();
-                            auto new_size = old_size + 1;
-                            dst.resize(new_size, GrowthStrategy::Geometric);
-                            auto dst_ptd = dst.getParticleTileData();
-
-                            copyParticle(dst_ptd, ptd, pindex, old_size);
-
-                            ptd.id(pindex).make_invalid(); // Invalidate the particle
-                        }
-                    } else {
-                        auto& particles_to_send = tmp_remote[who][thread_num];
-                        auto old_size = particles_to_send.size();
-                        auto new_size = old_size + superparticle_size;
-                        particles_to_send.resize(new_size);
-
-                        ptd.packParticleData(particles_to_send.data(), pindex, old_size,
-                                             h_redistribute_real_comp.data(), h_redistribute_int_comp.data());
-
-                        ptd.id(pindex).make_invalid(); // Invalidate the particle
-                    }
-
-                    if (!ptd.id(pindex).is_valid()) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        correctCellVectors(last, pindex, grid, p);
-                        --last;
-                        continue;
-                    }
-
-                    ++pindex;
-                }
-
-                Long tot_npart = particle_tile->numTotalParticles();
-
-                if (last != npart - 1) {
-                    pindex = last + 1;
-                    last = npart;
-                    while (last < tot_npart) {
-                        copyParticle(ptd, ptd, last, pindex);
-                        ++pindex;
-                        ++last;
-                    }
-                    particle_tile->resize(pindex);
-                }
-            }
-        }
-    }
-
-    for (int lev = lev_min; lev <= lev_max; lev++) {
-        particle_detail::clearEmptyEntries(m_particles[lev]);
-    }
-
-    // Second pass - for each tile in parallel, collect the particles we are owed from all threads' buffers.
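The first pass above also compacts each tile in place with a swap-with-last idiom: when a slot is invalidated, the element at `last` overwrites it and the same index is examined again. Reduced to a plain array, the idiom looks like this (a sketch; `keep` is a hypothetical stand-in for the particle validity test):

```cpp
// Sketch only: unstable in-place compaction by swapping with the last element.
#include <vector>

template <class T, class Pred>
std::size_t compact_in_place (std::vector<T>& v, Pred keep)
{
    std::size_t last = v.size();
    std::size_t i = 0;
    while (i < last) {
        if (keep(v[i])) { ++i; }              // element stays; move on
        else            { v[i] = v[--last]; } // overwrite; revisit slot i
    }
    v.resize(last);
    return last; // number of elements kept
}
```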
-    for (int lev = lev_min; lev <= lev_max; lev++) {
-
-        Vector<std::pair<int, int> > grid_tile_ids;
-
-        // we need to create any missing map entries in serial here
-        for (auto pmap_it = ptile_local[lev].begin(); pmap_it != ptile_local[lev].end(); pmap_it++)
-        {
-            DefineAndReturnParticleTile(lev, pmap_it->first.first, pmap_it->first.second);
-            grid_tile_ids.push_back(pmap_it->first);
-        }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-        for (int pit = 0; pit < static_cast<int>(grid_tile_ids.size()); ++pit) // NOLINT(modernize-loop-convert)
-        {
-            auto index = grid_tile_ids[pit];
-            auto& dst_ptile = ParticlesAt(lev, index.first, index.second);
-            auto& src_ptile = ptile_local[lev][index];
-
-            for (int i = 0; i < num_threads; ++i) {
-
-                Long to_copy = src_ptile[i].numParticles();
-                Long old_size = dst_ptile.numTotalParticles();
-                Long new_size = old_size + to_copy;
-                dst_ptile.resize(new_size);
-
-                auto dst_ptd = dst_ptile.getParticleTileData();
-                auto src_ptd = src_ptile[i].getParticleTileData();
-
-                for (Long j = 0; j < to_copy; ++j) {
-                    copyParticle(dst_ptd, src_ptd, j, j + old_size);
-                }
-
-                src_ptile[i].resize(0);
-                src_ptile[i].shrink_to_fit();
-            }
-        }
-    }
-
-    for (auto& map_it : tmp_remote) {
-        int who = map_it.first;
-        not_ours[who];
-    }
-
-    Vector<int> dest_proc_ids;
-    Vector<Vector<Vector<char> >* > pbuff_ptrs;
-    for (auto& kv : tmp_remote)
-    {
-        dest_proc_ids.push_back(kv.first);
-        pbuff_ptrs.push_back(&(kv.second));
-    }
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-    for (int pmap_it = 0; pmap_it < static_cast<int>(pbuff_ptrs.size()); ++pmap_it)
-    {
-        int who = dest_proc_ids[pmap_it];
-        Vector<Vector<char> >& tmp = *(pbuff_ptrs[pmap_it]);
-        for (int i = 0; i < num_threads; ++i) {
-            not_ours[who].insert(not_ours[who].end(), tmp[i].begin(), tmp[i].end());
-            tmp[i].erase(tmp[i].begin(), tmp[i].end());
-        }
-    }
-
-    particle_detail::clearEmptyEntries(not_ours);
-
-    if (int(m_particles.size()) > theEffectiveFinestLevel+1) {
-        // Looks like we lost an AmrLevel on a regrid.
-        if (m_verbose > 0) {
-            amrex::Print() << "ParticleContainer::Redistribute() resizing m_particles from "
-                           << m_particles.size() << " to " << theEffectiveFinestLevel+1 << '\n';
-        }
-        AMREX_ASSERT(int(m_particles.size()) >= 2);
-
-        m_particles.resize(theEffectiveFinestLevel + 1);
-        m_dummy_mf.resize(theEffectiveFinestLevel + 1);
-    }
-
-    if (ParallelContext::NProcsSub() == 1) {
-        AMREX_ASSERT(not_ours.empty());
-    }
-    else {
-        RedistributeMPI(not_ours, lev_min, lev_max, nGrow, local);
-    }
-
-    AMREX_ASSERT(OK(lev_min, lev_max, nGrow));
-
-    if (m_verbose > 0) {
-        auto stoptime = amrex::second() - strttime;
-
-        ByteSpread();
-
-#ifdef AMREX_LAZY
-        Lazy::QueueReduction( [=] () mutable {
-#endif
-        ParallelReduce::Max(stoptime, ParallelContext::IOProcessorNumberSub(),
-                            ParallelContext::CommunicatorSub());
-
-        amrex::Print() << "ParticleContainer::Redistribute() time: " << stoptime << "\n\n";
-#ifdef AMREX_LAZY
-        });
-#endif
-    }
-}
-
-template <typename ParticleType, int NArrayReal, int NArrayInt, template <class> class Allocator, class CellAssignor>
-void
-ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor>::
-RedistributeMPI (std::map<int, Vector<char> >& not_ours,
-                 int lev_min, int lev_max, int nGrow, int local)
-{
-    BL_PROFILE("ParticleContainer::RedistributeMPI()");
-    BL_PROFILE_VAR_NS("RedistributeMPI_locate", blp_locate);
-    BL_PROFILE_VAR_NS("RedistributeMPI_copy", blp_copy);
-
-#ifdef AMREX_USE_MPI
-
-    using buffer_type = unsigned long long;
-
-    std::map<int, Vector<buffer_type> > mpi_snd_data;
-    for (const auto& kv : not_ours)
-    {
-        auto nbt = (kv.second.size() + sizeof(buffer_type)-1)/sizeof(buffer_type);
-        mpi_snd_data[kv.first].resize(nbt);
-        std::memcpy((char*) mpi_snd_data[kv.first].data(), kv.second.data(), kv.second.size());
-    }
-
-    const int NProcs = ParallelContext::NProcsSub();
-    const int NNeighborProcs = neighbor_procs.size();
-
-    // We may now have particles that are rightfully owned by another CPU.
-    Vector<Long> Snds(NProcs, 0), Rcvs(NProcs, 0); // bytes!
-
-    Long NumSnds = 0;
-    if (local > 0)
-    {
-        AMREX_ALWAYS_ASSERT(lev_min == 0);
-        AMREX_ALWAYS_ASSERT(lev_max == 0);
-        BuildRedistributeMask(0, local);
-        NumSnds = doHandShakeLocal(not_ours, neighbor_procs, Snds, Rcvs);
-    }
-    else
-    {
-        NumSnds = doHandShake(not_ours, Snds, Rcvs);
-    }
-
-    const int SeqNum = ParallelDescriptor::SeqNum();
-
-    if ((! local) && NumSnds == 0) {
-        return; // There's no parallel work to do.
-    }
-
-    if (local)
-    {
-        Long tot_snds_this_proc = 0;
-        Long tot_rcvs_this_proc = 0;
-        for (int i = 0; i < NNeighborProcs; ++i) {
-            tot_snds_this_proc += Snds[neighbor_procs[i]];
-            tot_rcvs_this_proc += Rcvs[neighbor_procs[i]];
-        }
-        if ( (tot_snds_this_proc == 0) && (tot_rcvs_this_proc == 0) ) {
-            return; // There's no parallel work to do.
-        }
-    }
-
-    Vector<int> RcvProc;
-    Vector<std::size_t> rOffset; // Offset (in buffer_type units) in the receive buffer
-
-    std::size_t TotRcvInts = 0;
-    std::size_t TotRcvBytes = 0;
-    for (int i = 0; i < NProcs; ++i) {
-        if (Rcvs[i] > 0) {
-            RcvProc.push_back(i);
-            rOffset.push_back(TotRcvInts);
-            TotRcvBytes += Rcvs[i];
-            auto nbt = (Rcvs[i] + sizeof(buffer_type)-1)/sizeof(buffer_type);
-            TotRcvInts += nbt;
-        }
-    }
-
-    const auto nrcvs = static_cast<int>(RcvProc.size());
-    Vector<MPI_Status> stats(nrcvs);
-    Vector<MPI_Request> rreqs(nrcvs);
-
-    // Allocate data for rcvs as one big chunk.
-    Vector<buffer_type> recvdata(TotRcvInts);
-
-    // Post receives.
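One small point worth making explicit: the sizing logic above converts byte counts into whole `buffer_type` words with a ceiling division, both when packing `mpi_snd_data` and when accumulating `rOffset`/`TotRcvInts`. Stated on its own (C++17, illustrative only, not an AMReX function):

```cpp
// Sketch only: round a byte count up to whole buffer_type words.
#include <cstddef>

using buffer_type = unsigned long long;

constexpr std::size_t bytes_to_words (std::size_t nbytes)
{
    return (nbytes + sizeof(buffer_type) - 1) / sizeof(buffer_type);
}

static_assert(bytes_to_words(0) == 0);
static_assert(bytes_to_words(1) == 1);
static_assert(bytes_to_words(sizeof(buffer_type)) == 1);
static_assert(bytes_to_words(sizeof(buffer_type) + 1) == 2);
```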
-    for (int i = 0; i < nrcvs; ++i) {
-        const auto Who    = RcvProc[i];
-        const auto offset = rOffset[i];
-        const auto Cnt    = (Rcvs[Who] + sizeof(buffer_type)-1)/sizeof(buffer_type);
-        AMREX_ASSERT(Cnt > 0);
-        AMREX_ASSERT(Cnt < size_t(std::numeric_limits<int>::max()));
-        AMREX_ASSERT(Who >= 0 && Who < NProcs);
-
-        rreqs[i] = ParallelDescriptor::Arecv(&recvdata[offset], Cnt, Who, SeqNum,
-                                             ParallelContext::CommunicatorSub()).req();
-    }
-
-    // Send.
-    for (const auto& kv : mpi_snd_data) {
-        const auto Who = kv.first;
-        const auto Cnt = kv.second.size();
-
-        AMREX_ASSERT(Cnt > 0);
-        AMREX_ASSERT(Who >= 0 && Who < NProcs);
-        AMREX_ASSERT(Cnt < std::numeric_limits<int>::max());
-
-        ParallelDescriptor::Send(kv.second.data(), Cnt, Who, SeqNum,
-                                 ParallelContext::CommunicatorSub());
-    }
-
-    if (nrcvs > 0) {
-        ParallelDescriptor::Waitall(rreqs, stats);
-
-        BL_PROFILE_VAR_START(blp_locate);
-
-        int npart = TotRcvBytes / superparticle_size;
-
-        Vector<int> rcv_levs(npart);
-        Vector<int> rcv_grid(npart);
-        Vector<int> rcv_tile(npart);
-
-        int ipart = 0;
-        ParticleLocData pld;
-        for (int j = 0; j < nrcvs; ++j)
-        {
-            const auto offset = rOffset[j];
-            const auto Who    = RcvProc[j];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (int i = 0; i < int(Cnt); ++i)
-            {
-                char* pbuf = ((char*) &recvdata[offset]) + i*superparticle_size;
-
-                Particle<NStructReal, NStructInt> p;
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    std::memcpy(&p.m_idcpu, pbuf, sizeof(uint64_t));
-
-                    ParticleReal pos[AMREX_SPACEDIM];
-                    std::memcpy(&pos[0], pbuf + sizeof(uint64_t), AMREX_SPACEDIM*sizeof(ParticleReal));
-                    AMREX_D_TERM(p.pos(0) = pos[0];,
-                                 p.pos(1) = pos[1];,
-                                 p.pos(2) = pos[2]);
-                } else {
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                }
-
-                bool success = Where(p, pld, lev_min, lev_max, 0);
-                if (!success)
-                {
-                    success = (nGrow > 0) && Where(p, pld, lev_min, lev_min, nGrow);
-                    pld.m_grown_gridbox = pld.m_gridbox; // reset grown box for subsequent calls.
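An aside on the unpacking earlier in this loop: fields are extracted from the raw receive buffer with `std::memcpy` rather than by casting the buffer pointer, which avoids alignment and strict-aliasing problems. A minimal sketch with a hypothetical header layout (not the actual particle layout):

```cpp
// Sketch only: memcpy-based extraction of POD fields from a byte buffer.
#include <cstdint>
#include <cstring>

struct Header { std::uint64_t idcpu; double pos[3]; }; // hypothetical layout

Header read_header (const char* pbuf)
{
    Header h;
    std::memcpy(&h.idcpu, pbuf, sizeof(h.idcpu));
    std::memcpy(&h.pos[0], pbuf + sizeof(h.idcpu), sizeof(h.pos));
    return h;
}
```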
-                }
-                if (!success)
-                {
-                    amrex::Abort("RedistributeMPI_locate:: invalid particle.");
-                }
-
-                rcv_levs[ipart] = pld.m_lev;
-                rcv_grid[ipart] = pld.m_grid;
-                rcv_tile[ipart] = pld.m_tile;
-
-                ++ipart;
-            }
-        }
-
-        BL_PROFILE_VAR_STOP(blp_locate);
-
-        BL_PROFILE_VAR_START(blp_copy);
-
-#ifndef AMREX_USE_GPU
-        ipart = 0;
-        for (int i = 0; i < nrcvs; ++i)
-        {
-            const auto offset = rOffset[i];
-            const auto Who    = RcvProc[i];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (int j = 0; j < int(Cnt); ++j)
-            {
-                auto& ptile = m_particles[rcv_levs[ipart]][std::make_pair(rcv_grid[ipart],
-                                                                          rcv_tile[ipart])];
-                auto old_size = ptile.numTotalParticles();
-                auto new_size = old_size + 1;
-                ptile.resize(new_size, GrowthStrategy::Geometric);
-
-                char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    uint64_t idcpudata;
-                    std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
-                    pbuf += sizeof(uint64_t);
-                    ptile.GetStructOfArrays().GetIdCPUData()[old_size] = idcpudata;
-                } else {
-                    ParticleType p;
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                    pbuf += sizeof(ParticleType);
-                    ptile.GetArrayOfStructs()[old_size] = p;
-                }
-
-                int array_comp_start = 0;
-                if constexpr (!ParticleType::is_soa_particle) {
-                    array_comp_start = AMREX_SPACEDIM + NStructReal;
-                }
-                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                    if (h_redistribute_real_comp[array_comp_start + comp]) {
-                        ParticleReal rdata;
-                        std::memcpy(&rdata, pbuf, sizeof(ParticleReal));
-                        pbuf += sizeof(ParticleReal);
-                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = rdata;
-                    } else {
-                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = 0.0;
-                    }
-                }
-
-                array_comp_start = 2 + NStructInt;
-                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                    if (h_redistribute_int_comp[array_comp_start + comp]) {
-                        int idata;
-                        std::memcpy(&idata, pbuf, sizeof(int));
-                        pbuf += sizeof(int);
-                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = idata;
-                    } else {
-                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = 0;
-                    }
-                }
-                ++ipart;
-            }
-        }
-
-#else
-        Vector<std::map<std::pair<int, int>, Gpu::HostVector<ParticleType> > > host_particles;
-        host_particles.reserve(15);
-        host_particles.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>,
-                        std::vector<Gpu::HostVector<ParticleReal> > > > host_real_attribs;
-        host_real_attribs.reserve(15);
-        host_real_attribs.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>,
-                        std::vector<Gpu::HostVector<int> > > > host_int_attribs;
-        host_int_attribs.reserve(15);
-        host_int_attribs.resize(finestLevel()+1);
-
-        Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
-        host_idcpu.reserve(15);
-        host_idcpu.resize(finestLevel()+1);
-
-        ipart = 0;
-        for (int i = 0; i < nrcvs; ++i)
-        {
-            const auto offset = rOffset[i];
-            const auto Who    = RcvProc[i];
-            const auto Cnt    = Rcvs[Who] / superparticle_size;
-            for (auto j = decltype(Cnt)(0); j < Cnt; ++j)
-            {
-                int lev = rcv_levs[ipart];
-                std::pair<int, int> ind(std::make_pair(rcv_grid[ipart], rcv_tile[ipart]));
-
-                char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
-
-                host_real_attribs[lev][ind].resize(NumRealComps());
-                host_int_attribs[lev][ind].resize(NumIntComps());
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    uint64_t idcpudata;
-                    std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
-                    pbuf += sizeof(uint64_t);
-                    host_idcpu[lev][ind].push_back(idcpudata);
-                } else {
-                    ParticleType p;
-                    std::memcpy(&p, pbuf, sizeof(ParticleType));
-                    pbuf += sizeof(ParticleType);
-                    host_particles[lev][ind].push_back(p);
-                }
-
-                host_real_attribs[lev][ind].resize(NumRealComps());
-                host_int_attribs[lev][ind].resize(NumIntComps());
-
-                // add the real...
-                int array_comp_start = 0;
-                if constexpr (!ParticleType::is_soa_particle) {
-                    array_comp_start = AMREX_SPACEDIM + NStructReal;
-                }
-                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                    if (h_redistribute_real_comp[array_comp_start + comp]) {
-                        ParticleReal rdata;
-                        std::memcpy(&rdata, pbuf, sizeof(ParticleReal));
-                        pbuf += sizeof(ParticleReal);
-                        host_real_attribs[lev][ind][comp].push_back(rdata);
-                    } else {
-                        host_real_attribs[lev][ind][comp].push_back(0.0);
-                    }
-                }
-
-                // ... and int array data
-                array_comp_start = 2 + NStructInt;
-                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                    if (h_redistribute_int_comp[array_comp_start + comp]) {
-                        int idata;
-                        std::memcpy(&idata, pbuf, sizeof(int));
-                        pbuf += sizeof(int);
-                        host_int_attribs[lev][ind][comp].push_back(idata);
-                    } else {
-                        host_int_attribs[lev][ind][comp].push_back(0);
-                    }
-                }
-                ++ipart;
-            }
-        }
-
-        for (int host_lev = 0; host_lev < static_cast<int>(host_particles.size()); ++host_lev)
-        {
-            for (auto& kv : host_particles[host_lev]) {
-                auto grid = kv.first.first;
-                auto tile = kv.first.second;
-                const auto& src_tile = kv.second;
-
-                auto& dst_tile = GetParticles(host_lev)[std::make_pair(grid,tile)];
-                auto old_size = dst_tile.size();
-                auto new_size = old_size + src_tile.size();
-                dst_tile.resize(new_size);
-
-                if constexpr (ParticleType::is_soa_particle) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
-                                   host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
-                                   dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
-                } else {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   src_tile.begin(), src_tile.end(),
-                                   dst_tile.GetArrayOfStructs().begin() + old_size);
-                }
-
-                for (int i = 0; i < NumRealComps(); ++i) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_real_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
-                                   host_real_attribs[host_lev][std::make_pair(grid,tile)][i].end(),
-                                   dst_tile.GetStructOfArrays().GetRealData(i).begin() + old_size);
-                }
-
-                for (int i = 0; i < NumIntComps(); ++i) {
-                    Gpu::copyAsync(Gpu::hostToDevice,
-                                   host_int_attribs[host_lev][std::make_pair(grid,tile)][i].begin(),
-                                   host_int_attribs[host_lev][std::make_pair(grid,tile)][i].end(),
-                                   dst_tile.GetStructOfArrays().GetIntData(i).begin() + old_size);
-                }
-            }
-        }
-
-        Gpu::Device::streamSynchronize();
-#endif
-
-        BL_PROFILE_VAR_STOP(blp_copy);
-    }
-#else
-    amrex::ignore_unused(not_ours,lev_min,lev_max,nGrow,local);
-#endif
-}
-
 template <typename ParticleType, int NArrayReal, int NArrayInt, template <class> class Allocator, class CellAssignor>
 bool
diff --git a/Src/Particle/AMReX_ParticleLocator.H b/Src/Particle/AMReX_ParticleLocator.H
index aae714a868e..75fa01ae972 100644
--- a/Src/Particle/AMReX_ParticleLocator.H
+++ b/Src/Particle/AMReX_ParticleLocator.H
@@ -4,8 +4,11 @@
 #include
 #include
+#include
 #include
+#include
+
 namespace amrex {
@@ -22,17 +25,21 @@ struct AssignGrid
     Box m_domain;
     GpuArray<ParticleReal, AMREX_SPACEDIM> m_plo;
     GpuArray<ParticleReal, AMREX_SPACEDIM> m_dxi;
+    Dim3 m_tile_size;
+    bool m_do_tiling = false;

     AMREX_GPU_HOST_DEVICE
     AssignGrid () = default;

     AssignGrid (BinIteratorFactory a_bif,
                 const IntVect& a_bins_lo, const IntVect& a_bins_hi, const IntVect& a_bin_size,
-                const IntVect& a_num_bins, const Geometry& a_geom)
+                const IntVect& a_num_bins, const Geometry& a_geom,
+                bool a_do_tiling, const IntVect& a_tile_size)
         : m_bif(a_bif),
           m_lo(a_bins_lo.dim3()), m_hi(a_bins_hi.dim3()), m_bin_size(a_bin_size.dim3()),
           m_num_bins(a_num_bins.dim3()), m_domain(a_geom.Domain()),
-          m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray())
+          m_plo(a_geom.ProbLoArray()), m_dxi(a_geom.InvCellSizeArray()),
+          m_tile_size(a_tile_size.dim3()), m_do_tiling(a_do_tiling)
     {
         // clamp bin size and num_bins to 1 for AMREX_SPACEDIM < 3
         if (m_bin_size.x >= 0) {m_bin_size.x = amrex::max(m_bin_size.x, 1);}
@@ -44,19 +51,27 @@ struct AssignGrid
         if (m_bin_size.z >= 0) {m_num_bins.z = amrex::max(m_num_bins.z, 1);}
     }

+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    int getTile (const IntVect& iv, const Box& bx) const noexcept
+    {
+        Box tbx;
+        const IntVect tile_size(AMREX_D_DECL(m_tile_size.x, m_tile_size.y, m_tile_size.z));
+        return getTileIndex(iv, bx, m_do_tiling, tile_size, tbx);
+    }
+
     template <typename P, typename Assignor = DefaultAssignor>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const P& p, int nGrow=0, Assignor const& assignor = Assignor{}) const noexcept
+    std::pair<int, int> operator() (const P& p, int nGrow=0, Assignor const& assignor = Assignor{}) const noexcept
     {
         const auto iv = assignor(p, m_plo, m_dxi, m_domain);
         return this->operator()(iv, nGrow);
     }

     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    int operator() (const IntVect& iv, int nGrow=0) const noexcept
+    std::pair<int, int> operator() (const IntVect& iv, int nGrow=0) const noexcept
     {
         if (AMREX_D_TERM((m_num_bins.x == 0), && (m_num_bins.y == 0), && (m_num_bins.z == 0))) {
-            return -1;
+            return {-1, -1};
         }
         const auto lo = iv.dim3();
         int ix_lo = amrex::max((lo.x - nGrow - m_lo.x) / m_bin_size.x - 1, 0);
@@ -66,7 +81,8 @@ struct AssignGrid
         int ix_hi = amrex::min((lo.x + nGrow - m_lo.x) / m_bin_size.x, m_num_bins.x-1);
         int iy_hi = amrex::min((lo.y + nGrow - m_lo.y) / m_bin_size.y, m_num_bins.y-1);
         int iz_hi = amrex::min((lo.z + nGrow - m_lo.z) / m_bin_size.z, m_num_bins.z-1);
-        int loc = -1;
+        int loc_gid = -1;
+        int loc_tid = -1;
         for (int ii = ix_lo; ii <= ix_hi; ++ii) {
             for (int jj = iy_lo; jj <= iy_hi; ++jj) {
                 for (int kk = iz_lo; kk <= iz_hi; ++kk) {
@@ -74,28 +90,31 @@ struct AssignGrid
                     for (const auto& nbor : m_bif.getBinIterator(index)) {
                         Box bx = nbor.second;
                         if (bx.contains(iv)) {
-                            return nbor.first;
+                            return {nbor.first, getTile(iv, bx)};
                         }
                         Box gbx = bx;
                         gbx.grow(nGrow);
                         if (gbx.contains(iv)) {
-                            if (loc < 0) {
-                                loc = nbor.first;
-                            }
-                            // Prefer particle not in corner ghost cells
-                            for (int dir = 0; dir < AMREX_SPACEDIM; ++dir) {
-                                Box gdbx = bx;
-                                gdbx.grow(dir, nGrow);
-                                if (gdbx.contains(iv)) {
-                                    loc = nbor.first;
-                                }
-                            }
+                            int tile = getTile(iv, bx);
+                            if (loc_gid < 0) {
+                                loc_gid = nbor.first;
+                                loc_tid = tile;
+                            }
+                            // Prefer particle not in corner ghost cells
+                            for (int dir = 0; dir < AMREX_SPACEDIM; ++dir) {
+                                Box gdbx = bx;
+                                gdbx.grow(dir, nGrow);
+                                if (gdbx.contains(iv)) {
+                                    loc_gid = nbor.first;
+                                    loc_tid = tile;
+                                }
+                            }
                         }
                     }
                 }
             }
         }
-        return loc;
+        return {loc_gid, loc_tid};
     }
 };
@@ -108,9 +127,13 @@ public:

     ParticleLocator () = default;

-    void build (const BoxArray& ba, const Geometry& geom)
+    void build (const BoxArray& ba, const Geometry& geom,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         m_defined = true;
+        m_do_tiling = a_do_tiling;
+        m_tile_size = a_tile_size;
         m_ba = ba;
         m_geom = geom;
         int num_boxes = static_cast<int>(ba.size());
@@ -184,21 +207,30 @@ public:
     {
         AMREX_ASSERT(m_defined);
         return AssignGrid(m_bins.getBinIteratorFactory(),
-                          m_bins_lo, m_bins_hi, m_bin_size, m_num_bins, m_geom);
+                          m_bins_lo, m_bins_hi, m_bin_size, m_num_bins, m_geom,
+                          m_do_tiling, m_tile_size);
     }

-    bool isValid (const BoxArray& ba) const noexcept
+    bool isValid (const BoxArray& ba,
+                  bool a_do_tiling = false,
+                  const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const noexcept
     {
-        if (m_defined) { return BoxArray::SameRefs(m_ba, ba); }
+        if (m_defined) {
+            return BoxArray::SameRefs(m_ba, ba) &&
+                   (m_do_tiling == a_do_tiling) &&
+                   (m_tile_size == a_tile_size);
+        }
         return false;
     }

 protected:

     bool m_defined{false};
+    bool m_do_tiling{false};

     BoxArray m_ba;
     Geometry m_geom;
+    IntVect m_tile_size{AMREX_D_DECL(1024000, 1024000, 1024000)};

     IntVect m_bins_lo;
     IntVect m_bins_hi;
@@ -223,25 +255,25 @@ struct AmrAssignGrid

     template <typename P, typename Assignor = DefaultAssignor>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuTuple<int, int> operator() (const P& p, int lev_min=-1, int lev_max=-1, int nGrow=0,
-                                   Assignor const& assignor = {}) const noexcept
+    GpuTuple<int, int, int> operator() (const P& p, int lev_min=-1, int lev_max=-1, int nGrow=0,
+                                        Assignor const& assignor = {}) const noexcept
     {
         lev_min = (lev_min == -1) ? 0 : lev_min;
         lev_max = (lev_max == -1) ? m_size - 1 : lev_max;

         for (int lev = lev_max; lev >= lev_min; --lev) {
-            int grid = m_funcs[lev](p, 0, assignor);
-            if (grid >= 0) { return makeTuple(grid, lev); }
+            auto grid_tile = m_funcs[lev](p, 0, assignor);
+            if (grid_tile.first >= 0) { return makeTuple(grid_tile.first, grid_tile.second, lev); }
         }

         {
             int lev = lev_min;
-            int grid = m_funcs[lev](p, nGrow, assignor);
-            if (grid >= 0) { return makeTuple(grid, lev); }
+            auto grid_tile = m_funcs[lev](p, nGrow, assignor);
+            if (grid_tile.first >= 0) { return makeTuple(grid_tile.first, grid_tile.second, lev); }
         }

-        return makeTuple(-1, -1);
+        return makeTuple(-1, -1, -1);
     }
 };
@@ -261,18 +293,24 @@ public:

     AmrParticleLocator() = default;

     AmrParticleLocator(const Vector<BoxArray>& a_ba,
-                       const Vector<Geometry>& a_geom)
+                       const Vector<Geometry>& a_geom,
+                       bool a_do_tiling = false,
+                       const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
-        build(a_ba, a_geom);
+        build(a_ba, a_geom, a_do_tiling, a_tile_size);
     }

-    AmrParticleLocator(const ParGDBBase* a_gdb)
+    AmrParticleLocator(const ParGDBBase* a_gdb,
+                       bool a_do_tiling = false,
+                       const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
-        build(a_gdb);
+        build(a_gdb, a_do_tiling, a_tile_size);
     }

     void build (const Vector<BoxArray>& a_ba,
-                const Vector<Geometry>& a_geom)
+                const Vector<Geometry>& a_geom,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         m_defined = true;
         int num_levels = static_cast<int>(a_ba.size());
@@ -282,7 +320,7 @@ public:
         Gpu::HostVector<AssignGrid<typename Bins::BinIteratorFactory> > h_grid_assignors(num_levels);
         for (int lev = 0; lev < num_levels; ++lev)
         {
-            m_locators[lev].build(a_ba[lev], a_geom[lev]);
+            m_locators[lev].build(a_ba[lev], a_geom[lev], a_do_tiling, a_tile_size);
             h_grid_assignors[lev] = m_locators[lev].getGridAssignor();
         }
         Gpu::htod_memcpy_async(m_grid_assignors.data(), h_grid_assignors.data(),
@@ -291,13 +329,15 @@
 #else
         for (int lev = 0; lev < num_levels; ++lev)
         {
-            m_locators[lev].build(a_ba[lev], a_geom[lev]);
+            m_locators[lev].build(a_ba[lev], a_geom[lev], a_do_tiling, a_tile_size);
             m_grid_assignors[lev] = m_locators[lev].getGridAssignor();
         }
 #endif
     }

-    void build (const ParGDBBase* a_gdb)
+    void build (const ParGDBBase* a_gdb,
+                bool a_do_tiling = false,
+                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000)))
     {
         Vector<BoxArray> ba;
         Vector<Geometry> geom;
@@ -307,29 +347,33 @@ public:
             ba.push_back(a_gdb->ParticleBoxArray(lev));
             geom.push_back(a_gdb->Geom(lev));
         }
-        build(ba, geom);
+        build(ba, geom, a_do_tiling, a_tile_size);
     }

-    [[nodiscard]] bool isValid (const Vector<BoxArray>& a_ba) const
+    [[nodiscard]] bool isValid (const Vector<BoxArray>& a_ba,
+                                bool a_do_tiling = false,
+                                const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const
     {
         if ( !m_defined || (m_locators.empty()) ||
              (m_locators.size() != a_ba.size()) ) {
             return false;
         }
         bool all_valid = true;
         int num_levels = m_locators.size();
         for (int lev = 0; lev < num_levels; ++lev) {
-            all_valid = all_valid && m_locators[lev].isValid(a_ba[lev]);
+            all_valid = all_valid && m_locators[lev].isValid(a_ba[lev], a_do_tiling, a_tile_size);
         }
         return all_valid;
     }

-    bool isValid (const ParGDBBase* a_gdb) const
+    bool isValid (const ParGDBBase* a_gdb,
+                  bool a_do_tiling = false,
+                  const IntVect& a_tile_size = IntVect(AMREX_D_DECL(1024000, 1024000, 1024000))) const
     {
         Vector<BoxArray> ba;
         int num_levels = a_gdb->finestLevel()+1;
         for (int lev = 0; lev < num_levels; ++lev) {
             ba.push_back(a_gdb->ParticleBoxArray(lev));
         }
-        return this->isValid(ba);
+        return this->isValid(ba, a_do_tiling, a_tile_size);
     }

     void setGeometry (const ParGDBBase* a_gdb)
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index 9b8e055d031..c781dbd3266 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -550,7 +550,7 @@ partitionParticles (PTile& ptile, ParFunc const& is_left)
     // it compensates for the extra kernel launches and evaluations of is_left which this
     // algorithm needs.

-    ParallelFor(max_num_swaps,
+    ParallelForOMP(max_num_swaps,
     [=] AMREX_GPU_DEVICE (int i)
     {
         int left_i = p_index_left[i];
@@ -618,7 +618,7 @@ partitionParticles (PTile& ptile, int num_left, ParFunc const& is_left)
     }, Scan::Type::exclusive, Scan::noRetSum);

-    ParallelFor(max_num_swaps,
+    ParallelForOMP(max_num_swaps,
     [=] AMREX_GPU_DEVICE (int i)
     {
         int left_i = p_index_left[i];
@@ -643,8 +643,6 @@ removeInvalidParticles (PTile& ptile)
     ptile.resize(new_size);
 }

-#if defined(AMREX_USE_GPU)
-
 template <class PTile, class PLocator, class CellAssignor>
 int
 partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const& assignor,
@@ -654,7 +652,7 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
                           const GpuArray<ParticleReal, AMREX_SPACEDIM>& rlo,
                           const GpuArray<ParticleReal, AMREX_SPACEDIM>& rhi,
                           const GpuArray<int, AMREX_SPACEDIM>& is_per,
-                          int lev, int gid, int /*tid*/,
+                          int lev, int gid, int tid,
                           int lev_min, int lev_max, int nGrow, bool remove_negative)
 {
     auto getPID = pmap.getPIDFunctor();
@@ -666,15 +664,17 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const

     // the function for determining if a particle stays on this grid is very slow,
     // so we cache it in particle_stays to avoid evaluating it multiple times.
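The comment above names the key optimization: the expensive locate-and-compare test runs once per particle and its result is cached in a byte array, and the partition then consults only the cached flags, as the kernel right after this note does on the device. A serial sketch of the same cache-then-partition idea (hypothetical names, not the AMReX kernel):

```cpp
// Sketch only: evaluate an expensive predicate once per element, then
// partition on the cached flags so the predicate is never re-run.
#include <cstdint>
#include <utility>
#include <vector>

template <class T, class Pred>
int partition_with_cache (std::vector<T>& items, Pred expensive_stays)
{
    std::vector<std::uint8_t> stays(items.size());
    for (std::size_t i = 0; i < items.size(); ++i) {
        stays[i] = expensive_stays(items[i]) ? 1 : 0; // one evaluation per item
    }
    std::size_t num_stay = 0;
    for (std::size_t i = 0; i < items.size(); ++i) {
        if (stays[i]) {
            std::swap(items[num_stay], items[i]);
            std::swap(stays[num_stay], stays[i]);
            ++num_stay;
        }
    }
    return static_cast<int>(num_stay); // "staying" elements now lead the array
}
```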
-    ParallelFor(ptile.numParticles(),
+    ParallelForOMP(ptile.numParticles(),
     [=] AMREX_GPU_DEVICE (int i)
     {
         int assigned_grid;
+        int assigned_tile;
         int assigned_lev;

         if (!ptd.id(i).is_valid())
         {
             assigned_grid = -1;
+            assigned_tile = -1;
             assigned_lev = -1;
         }
         else
@@ -683,7 +683,8 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
             enforcePeriodic(p_prime, plo, phi, rlo, rhi, is_per);
             auto tup_prime = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
             assigned_grid = amrex::get<0>(tup_prime);
-            assigned_lev = amrex::get<1>(tup_prime);
+            assigned_tile = amrex::get<1>(tup_prime);
+            assigned_lev = amrex::get<2>(tup_prime);
             if (assigned_grid >= 0)
             {
                 AMREX_D_TERM(ptd.pos(0, i) = p_prime.pos(0);,
@@ -697,13 +698,15 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
                              p_prime.pos(2) = ptd.pos(2, i););
                 auto tup = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
                 assigned_grid = amrex::get<0>(tup);
-                assigned_lev = amrex::get<1>(tup);
+                assigned_tile = amrex::get<1>(tup);
+                assigned_lev = amrex::get<2>(tup);
             }
         }

         p_particle_stays[i] = uint8_t(
             ((remove_negative == false) && (!ptd.id(i).is_valid())) ||
-            ((assigned_grid == gid) && (assigned_lev == lev) && (getPID(lev, gid) == pid)));
+            ((assigned_grid == gid) && (assigned_tile == tid) &&
+             (assigned_lev == lev) && (getPID(lev, gid, tid) == pid)));
     });

     return partitionParticles(ptile,
@@ -712,8 +715,6 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const
     });
 }

-#endif
-
 template <class PC1, class PC2>
 bool SameIteratorsOK (const PC1& pc1, const PC2& pc2) {
     if (pc1.numLevels() != pc2.numLevels()) {return false;}
diff --git a/Tests/Particles/Intersection/main.cpp b/Tests/Particles/Intersection/main.cpp
index 48ccb3c0bcd..a021107b0a7 100644
--- a/Tests/Particles/Intersection/main.cpp
+++ b/Tests/Particles/Intersection/main.cpp
@@ -92,7 +92,7 @@ void testIntersection()
     auto* const grids_ptr = device_grids.dataPtr();
     amrex::ParallelFor(num_cells, [=] AMREX_GPU_DEVICE (int j) noexcept
     {
-        grids_ptr[j] = assign_grid(cells_ptr[j]);
+        grids_ptr[j] = assign_grid(cells_ptr[j]).first;
     });

     ReduceOps reduce_op;
diff --git a/Tests/Particles/ParallelContext/main.cpp b/Tests/Particles/ParallelContext/main.cpp
index 4a55193eb27..b37fb752cac 100644
--- a/Tests/Particles/ParallelContext/main.cpp
+++ b/Tests/Particles/ParallelContext/main.cpp
@@ -73,6 +73,7 @@ class TestParticleContainer
         const int lev = 0;  // only add particles on level 0
         const Real* dx = Geom(lev).CellSize();
         const Real* plo = Geom(lev).ProbLo();
+        const IntVect dom_lo = Geom(lev).Domain().smallEnd();

         const int num_ppc = AMREX_D_TERM( a_num_particles_per_cell[0],
                                          *a_num_particles_per_cell[1],
                                          *a_num_particles_per_cell[2]);
@@ -98,12 +99,12 @@ class TestParticleContainer
                 ParticleType p;
                 p.id() = ParticleType::NextID();
                 p.cpu() = ParallelDescriptor::MyProc();
-                p.pos(0) = static_cast<ParticleReal> (plo[0] + (iv[0] + r[0])*dx[0]);
+                p.pos(0) = static_cast<ParticleReal> (plo[0] + ((iv[0] - dom_lo[0]) + r[0])*dx[0]);
 #if AMREX_SPACEDIM > 1
-                p.pos(1) = static_cast<ParticleReal> (plo[1] + (iv[1] + r[1])*dx[1]);
+                p.pos(1) = static_cast<ParticleReal> (plo[1] + ((iv[1] - dom_lo[1]) + r[1])*dx[1]);
 #endif
 #if AMREX_SPACEDIM > 2
-                p.pos(2) = static_cast<ParticleReal> (plo[2] + (iv[2] + r[2])*dx[2]);
+                p.pos(2) = static_cast<ParticleReal> (plo[2] + ((iv[2] - dom_lo[2]) + r[2])*dx[2]);
 #endif

                 for (int i = 0; i < NSR; ++i) { p.rdata(i) = ParticleReal(p.id()); }
diff --git a/Tests/Particles/Redistribute/CMakeLists.txt b/Tests/Particles/Redistribute/CMakeLists.txt
index cdae8d13d47..891631a3f24 100644
--- a/Tests/Particles/Redistribute/CMakeLists.txt
+++ b/Tests/Particles/Redistribute/CMakeLists.txt
@@ -8,6 +8,34 @@ foreach(D IN LISTS AMReX_SPACEDIM)

    setup_test(${D} _sources _input_files)

+   if (D EQUAL 3 AND NOT AMReX_GPU_BACKEND STREQUAL NONE)
+      set(_exe_dir ${CMAKE_CURRENT_BINARY_DIR}/${D}d)
+      set(_exe_name Test_Particles_Redistribute_${D}d)
+
+      file(COPY inputs.rt.cuda.tiling DESTINATION ${_exe_dir})
+
+      if (AMReX_OMP)
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+         set_tests_properties(Particles_Redistribute_${D}d_Tiling PROPERTIES ENVIRONMENT OMP_NUM_THREADS=2)
+      elseif (AMReX_MPI)
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND mpiexec -n 2 ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+      else ()
+         add_test(
+            NAME Particles_Redistribute_${D}d_Tiling
+            COMMAND ${_exe_dir}/${_exe_name} inputs.rt.cuda.tiling
+            WORKING_DIRECTORY ${_exe_dir}
+         )
+      endif ()
+   endif ()
+
    unset(_sources)
    unset(_input_files)

 endforeach()
diff --git a/Tests/Particles/Redistribute/inputs.rt.cuda.tiling b/Tests/Particles/Redistribute/inputs.rt.cuda.tiling
new file mode 100644
index 00000000000..62a114f03ac
--- /dev/null
+++ b/Tests/Particles/Redistribute/inputs.rt.cuda.tiling
@@ -0,0 +1,14 @@
+redistribute.size = (32, 64, 64)
+redistribute.max_grid_size = 32
+redistribute.is_periodic = 1
+redistribute.num_ppc = 1
+redistribute.move_dir = (1, 1, 1)
+redistribute.do_random = 1
+redistribute.nsteps = 100
+redistribute.nlevs = 1
+redistribute.do_regrid = 1
+
+redistribute.num_runtime_real = 2
+redistribute.num_runtime_int = 3
+
+particles.do_tiling=1
\ No newline at end of file
diff --git a/Tests/Particles/Redistribute/main.cpp b/Tests/Particles/Redistribute/main.cpp
index adc65598eba..a9aede8a161 100644
--- a/Tests/Particles/Redistribute/main.cpp
+++ b/Tests/Particles/Redistribute/main.cpp
@@ -90,6 +90,9 @@ class TestParticleContainer
                                          *a_num_particles_per_cell[1],
                                          *a_num_particles_per_cell[2]);

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for(MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi)
         {
             const Box& tile_box = mfi.tilebox();
@@ -189,6 +192,9 @@ class TestParticleContainer
     {
         BL_PROFILE("TestParticleContainer::moveParticles");

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             const auto dx = Geom(lev).CellSizeArray();
@@ -241,6 +247,9 @@ class TestParticleContainer
     {
         BL_PROFILE("TestParticleContainer::invalidateEven");

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             auto& plev = GetParticles(lev);
@@ -272,6 +281,9 @@ class TestParticleContainer
         int num_rr = NumRuntimeRealComps();
         int num_ii = NumRuntimeIntComps();

+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
         for (int lev = 0; lev <= finestLevel(); ++lev)
         {
             const auto& plev = GetParticles(lev);