diff --git a/Examples/CMakeLists.txt b/Examples/CMakeLists.txt
index cb8057a8a71..b3e1fac1ad5 100644
--- a/Examples/CMakeLists.txt
+++ b/Examples/CMakeLists.txt
@@ -145,6 +145,8 @@ function(add_warpx_test )
 
         # FIXME Use helper function to handle Windows exceptions
         set_property(TEST ${name}.run APPEND PROPERTY ENVIRONMENT "PYTHONPATH=$ENV{PYTHONPATH}:${CMAKE_PYTHON_OUTPUT_DIRECTORY}")
+        # allocate GPU memory on-demand for Python tests too (same reason as native tests below)
+        set_property(TEST ${name}.run APPEND PROPERTY ENVIRONMENT "AMREX_THE_ARENA_INIT_SIZE=0")
     else()
         # TODO Use these for Python tests too
         set(runtime_params
diff --git a/Examples/Physics_applications/pierce_diode/analysis_default_regression.py b/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
index e143e396f0c..4563b7cd3c6 100755
--- a/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
+++ b/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
@@ -76,6 +76,7 @@ def main(args):
             OpenPMDTimeSeries(args.path)
         except Exception:
             print("Could not open the file as a plotfile or an openPMD time series")
+            args.format = "openpmd"
         else:
             args.format = "openpmd"
     else:
diff --git a/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py b/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
index 55365bd4c76..3a623d702bc 100755
--- a/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
+++ b/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
@@ -26,7 +26,7 @@ def find_first_non_zero_from_bottom_left(matrix):
     for i in range(matrix.shape[0]):
         for j in range(matrix.shape[1]):
-            if (matrix[i][j] != 0) and (matrix[i][j] != np.nan):
+            if (matrix[i][j] != 0) and (not np.isnan(matrix[i][j])):
                 return (i, j)
     return i, j
 
 
@@ -34,7 +34,7 @@ def find_first_non_zero_from_upper_right(matrix):
     for i in range(matrix.shape[0] - 1, -1, -1):
         for j in range(matrix.shape[1] - 1, -1, -1):
-            if (matrix[i][j] != 0) and (matrix[i][j] != np.nan):
+            if (matrix[i][j] != 0) and (not np.isnan(matrix[i][j])):
                 return (i, j)
     return i, j
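
Note on the `np.isnan` change above: `x != np.nan` is true for every `x`, including NaN itself, because NaN compares unequal to everything, so the old guard never actually filtered NaN cells. A minimal standalone check (plain NumPy, independent of WarpX):

```python
import numpy as np

cell = np.nan
print(cell != np.nan)  # True: NaN is unequal to everything, even NaN
print(np.isnan(cell))  # True: the correct NaN test

# The old condition lets a NaN cell through; the new one rejects it.
print((cell != 0) and (cell != np.nan))      # True  (wrong)
print((cell != 0) and (not np.isnan(cell)))  # False (right)
```
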
diff --git a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
index c0cd729f50c..146b21d8ab2 100644
--- a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
+++ b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
@@ -15,7 +15,7 @@ my_constants.dt = 0.1/wpe # s
 #################################
 max_step = 20
 amr.n_cell = 40 40
-amr.max_grid_size = 8
+amr.max_grid_size = 16
 amr.blocking_factor = 8
 amr.max_level = 0
 geometry.dims = 2
diff --git a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
index c7457e02af8..99c8a1f0468 100644
--- a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
+++ b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
@@ -15,7 +15,7 @@ my_constants.dt = 0.1/wpe # s
 #################################
 max_step = 20
 amr.n_cell = 40 40
-amr.max_grid_size = 8
+amr.max_grid_size = 16
 amr.blocking_factor = 8
 amr.max_level = 0
 geometry.dims = 2
diff --git a/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd b/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
index cf95a07e2fc..668654d8007 100644
--- a/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
+++ b/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
@@ -4,6 +4,7 @@ FILE = inputs_base_2d
 # test input parameters
 algo.maxwell_solver = psatd
 amr.max_level = 1
+amr.max_grid_size = 128
 amr.ref_ratio = 4
 diag1.electrons.variables = x z w ux uy uz
 diag1.positrons.variables = x z w ux uy uz
diff --git a/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb b/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
index 73ff14dc8a4..45c17e7984a 100644
--- a/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
+++ b/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
@@ -29,7 +29,7 @@ algo.particle_shape = 1
 algo.current_deposition = direct
 particles.species_names = beam
-particles.do_tiling = 1
+particles.do_tiling = 0
 
 beam.mass = m_e
 beam.charge = -q_e
 beam.injection_style = nuniformpercell
diff --git a/Examples/analysis_default_regression.py b/Examples/analysis_default_regression.py
index 00cc4e31536..1078c352aec 100755
--- a/Examples/analysis_default_regression.py
+++ b/Examples/analysis_default_regression.py
@@ -74,6 +74,7 @@ def main(args):
             OpenPMDTimeSeries(args.path)
         except Exception:
             print("Could not open the file as a plotfile or an openPMD time series")
+            args.format = "openpmd"
         else:
             args.format = "openpmd"
     else:
diff --git a/Examples/analysis_default_restart.py b/Examples/analysis_default_restart.py
index ad6bc22e60e..a12dcb1becf 100755
--- a/Examples/analysis_default_restart.py
+++ b/Examples/analysis_default_restart.py
@@ -49,11 +49,22 @@ def check_restart(filename, tolerance=1e-12):
         dims=ds_benchmark.domain_dimensions,
     )
 
-    # Loop over all fields (all particle species, all particle attributes, all grid fields)
-    # and compare output data generated from initial run with output data generated after restart
+    # Separate grid fields from particle fields. Particle fields use the
+    # species name as field type; grid fields use 'boxlib'.
+    particle_species = set()
+    grid_fields = []
+    for field in ds_benchmark.field_list:
+        ftype, fname = field
+        if ftype == "boxlib":
+            grid_fields.append(field)
+        elif ftype != "all":
+            particle_species.add(ftype)
+
     print(f"\ntolerance = {tolerance}")
     print()
-    for field in ds_benchmark.field_list:
+
+    # Compare grid fields directly (order is deterministic)
+    for field in grid_fields:
         dr = ad_restart[field].squeeze().v
         db = ad_benchmark[field].squeeze().v
         error = np.amax(np.abs(dr - db))
@@ -61,6 +72,31 @@ def check_restart(filename, tolerance=1e-12):
             error /= np.amax(np.abs(db))
         print(f"field: {field}; error = {error}")
         assert error < tolerance
+
+    # Compare particle fields sorted by (particle_cpu, particle_id), since
+    # Redistribute() after checkpoint-restart may reorder particles across
+    # tiles/ranks. The (cpu, id) pair is the unique particle key in AMReX.
+    for species in sorted(particle_species):
+        species_fields = [f for f in ds_benchmark.field_list if f[0] == species]
+
+        id_r = np.atleast_1d(ad_restart[(species, "particle_id")].squeeze().v)
+        id_b = np.atleast_1d(ad_benchmark[(species, "particle_id")].squeeze().v)
+        cpu_r = np.atleast_1d(ad_restart[(species, "particle_cpu")].squeeze().v)
+        cpu_b = np.atleast_1d(ad_benchmark[(species, "particle_cpu")].squeeze().v)
+
+        sort_r = np.lexsort((id_r, cpu_r))
+        sort_b = np.lexsort((id_b, cpu_b))
+
+        for field in species_fields:
+            if field[1] in ("particle_id", "particle_cpu"):
+                continue
+            dr = np.atleast_1d(ad_restart[field].squeeze().v)[sort_r]
+            db = np.atleast_1d(ad_benchmark[field].squeeze().v)[sort_b]
+            error = np.amax(np.abs(dr - db))
+            if np.amax(np.abs(db)) != 0.0:
+                error /= np.amax(np.abs(db))
+            print(f"field: {field}; error = {error}")
+            assert error < tolerance
 
     print()
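
For reference on the sorting change above: `np.lexsort` treats its last key as the primary key, so `np.lexsort((id, cpu))` sorts by `particle_cpu` first and `particle_id` second, which is the (cpu, id) key named in the comment. A self-contained sketch with toy data (not WarpX output) showing how the two permutations line up:

```python
import numpy as np

# Same three particles in two different storage orders, as can happen
# after Redistribute() on restart.
cpu_b = np.array([0, 0, 1])
id_b = np.array([2, 1, 1])
x_b = np.array([10.0, 20.0, 30.0])  # benchmark attribute order

perm = np.array([2, 0, 1])  # hypothetical reordering after restart
cpu_r, id_r, x_r = cpu_b[perm], id_b[perm], x_b[perm]

# Sort both datasets by (cpu, id); the LAST lexsort key is primary.
sort_b = np.lexsort((id_b, cpu_b))
sort_r = np.lexsort((id_r, cpu_r))

assert np.array_equal(x_b[sort_b], x_r[sort_r])  # attributes comparable again
```
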
diff --git a/Python/pywarpx/_libwarpx.py b/Python/pywarpx/_libwarpx.py
index 4b033e36239..dce36d5a554 100755
--- a/Python/pywarpx/_libwarpx.py
+++ b/Python/pywarpx/_libwarpx.py
@@ -71,6 +71,17 @@ def load_library(self):
                 "Please write separate scripts for each geometry."
             )
 
+        # Import mpi4py before the pyAMReX (amrex.space*d) shared library is
+        # loaded so that mpi4py calls MPI_Init_thread first. With the Cray
+        # MPICH on Polaris/Sirius, if the AMReX shared library is loaded before
+        # mpi4py initializes MPI, mpi4py's later MPI_Init_thread conflicts with
+        # the already-loaded MPI symbols and causes hangs or unbounded memory
+        # allocation during amrex::Initialize().
+        try:
+            from mpi4py import MPI  # noqa: F811,F401
+        except ImportError:
+            pass  # mpi4py is optional; MPI_Init handled by AMReX if absent
+
         # --- Use geometry to determine whether to import the 1D, 2D, 3D or RZ version.
         # --- The geometry must be setup before the lib warpx shared object can be loaded.
         try:
@@ -147,7 +158,7 @@ def load_library(self):
         register_warpx_WarpXParticleContainer_extension(self.libwarpx_so)
 
     def amrex_init(self, argv, mpi_comm=None):
-        if mpi_comm is None:  # or MPI is None:
+        if mpi_comm is None:
             self.libwarpx_so.amrex_init(argv)
         else:
             raise Exception("mpi_comm argument not yet supported")
@@ -172,9 +183,28 @@ def finalize(self, finalize_mpi=1):
         """
         # TODO: simplify, part of pyAMReX already
         if self.initialized:
+            # GPU finalization workaround: on GPU backends, destroying the C++
+            # WarpX object (del self.warpx) or calling amrex_finalize() can
+            # crash in SYCL/CUDA/HIP Device::Finalize() when it tries to
+            # synchronize streams whose backing static objects are already gone.
+            # Exit immediately via os._exit(0) BEFORE any C++ destructors run,
+            # and let the job launcher (PALS/mpiexec) handle cleanup.
+            try:
+                if self.libwarpx_so.Config.gpu_backend in ("SYCL", "CUDA", "HIP"):
+                    try:
+                        from mpi4py import MPI
+
+                        MPI.COMM_WORLD.Barrier()
+                    except Exception:
+                        pass  # best-effort barrier; proceeding to _exit
+                    os._exit(0)
+            except Exception:
+                pass  # Config may not be available; fall through to normal cleanup
+
             del self.warpx
             # The call to warpx_finalize causes a crash - don't know why
             # self.libwarpx_so.warpx_finalize()
+
             self.libwarpx_so.amrex_finalize()
 
             from pywarpx import callbacks
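
The import-order rule encoded in `load_library` above also applies to user scripts that import pyAMReX directly. A minimal sketch of the safe ordering, assuming a 3D pyAMReX build (`amrex.space3d`) and mpi4py installed:

```python
# Initialize MPI via mpi4py BEFORE any AMReX-linked extension module is
# imported, so that MPI_Init_thread runs exactly once, up front.
from mpi4py import MPI  # first: initializes MPI

import amrex.space3d as amr  # then: loads the AMReX shared library

amr.initialize([])
print(f"rank {MPI.COMM_WORLD.Get_rank()} of {MPI.COMM_WORLD.Get_size()}")
amr.finalize()
```
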
diff --git a/Source/Diagnostics/WarpXOpenPMD.cpp b/Source/Diagnostics/WarpXOpenPMD.cpp
index 06b446f6158..0634cfc89b0 100644
--- a/Source/Diagnostics/WarpXOpenPMD.cpp
+++ b/Source/Diagnostics/WarpXOpenPMD.cpp
@@ -593,6 +593,10 @@ for (const auto & particle_diag : particle_diags) {
         particlesConvertUnits(ConvertDirection::SI_to_WarpX, pc, mass);
     }
 
+    // On GPU backends, copyParticles may be asynchronous; synchronize before
+    // passing pinned-memory pointers to openPMD's storeChunkRaw.
+    amrex::Gpu::streamSynchronize();
+
     // Gather the electrostatic potential (phi) on the macroparticles
     if ( particle_diag.m_plot_phi ) {
         storePhiOnParticles( tmp, WarpX::electrostatic_solver_id, !use_pinned_pc );
diff --git a/Source/Particles/ParticleCreation/DefaultInitialization.H b/Source/Particles/ParticleCreation/DefaultInitialization.H
index 76b40cf5252..81dac7295ef 100644
--- a/Source/Particles/ParticleCreation/DefaultInitialization.H
+++ b/Source/Particles/ParticleCreation/DefaultInitialization.H
@@ -14,6 +14,7 @@
 #   include "Particles/ElementaryProcess/QEDInternals/QuantumSyncEngineWrapper.H"
 #endif
 
+#include <AMReX_GpuAllocators.H>
 #include <...>
 #include <...>
 
@@ -21,6 +22,23 @@
 #include <...>
 #include <...>
 
+namespace ParticleCreation {
+/** Whether particle data with allocator Alloc lives on the GPU.
+ *  amrex::RunOnGpu is not specialised for PolymorphicArenaAllocator,
+ *  so we extend the check here. PolymorphicArenaAllocator in WarpX
+ *  always wraps a device arena, so it is safe to treat it as GPU. */
+template <typename Alloc>
+constexpr bool particles_on_gpu ()
+{
+#ifdef AMREX_USE_GPU
+    return amrex::RunOnGpu<Alloc>::value
+        || amrex::IsPolymorphicArenaAllocator<Alloc>::value;
+#else
+    return false;
+#endif
+}
+} // namespace ParticleCreation
+
 /**
  * \brief This set of initialization policies describes what happens
  * when we need to create a new particle due to an elementary process.
@@ -158,7 +176,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const QuantumSynchrotronGetOpticalDepth quantum_sync_get_opt =
                 p_qs_engine->build_optical_depth_functor();
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelForRNG(stop - start,
                 [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept {
                     const int ip = i + start;
@@ -185,7 +203,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const BreitWheelerGetOpticalDepth breit_wheeler_get_opt =
                 p_bw_engine->build_optical_depth_functor();;
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelForRNG(stop - start,
                 [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept {
                     const int ip = i + start;
@@ -214,7 +232,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const amrex::ParserExecutor<7> user_real_attrib_parserexec =
                 user_real_attrib_parser[ia]->compile<7>();
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelFor(stop - start,
                 [=] AMREX_GPU_DEVICE (int i) noexcept {
                     const int ip = i + start;
@@ -246,7 +264,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             if (it_ioniz != particle_icomps.end() &&
                 std::distance(particle_icomps.begin(), it_ioniz) == j)
             {
-                if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<int>>::value) {
+                if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<int>>()) {
                     amrex::ParallelFor(stop - start,
                     [=] AMREX_GPU_DEVICE (int i) noexcept {
                         const int ip = i + start;
@@ -268,7 +286,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             {
                 const amrex::ParserExecutor<7> user_int_attrib_parserexec =
                     user_int_attrib_parser[ia]->compile<7>();
-                if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<int>>::value) {
+                if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<int>>()) {
                     amrex::ParallelFor(stop - start,
                     [=] AMREX_GPU_DEVICE (int i) noexcept {
                         const int ip = i + start;
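
Why the trait extension above is needed: `DefaultInitializeRuntimeAttributes` picks between a GPU kernel launch and a host loop at compile time, and plain `amrex::RunOnGpu` reports false for `PolymorphicArenaAllocator`, so the host path would be chosen even though the tile memory lives in a device arena. A toy Python illustration of that dispatch logic (illustrative types only, not the AMReX API):

```python
# Toy model of the compile-time dispatch in DefaultInitializeRuntimeAttributes.
class ArenaAllocator:  # device memory; amrex::RunOnGpu -> true
    runs_on_gpu = True

class PinnedAllocator:  # host memory; RunOnGpu -> false
    runs_on_gpu = False

class PolymorphicArenaAllocator:  # RunOnGpu -> false, but wraps a device arena
    runs_on_gpu = False
    polymorphic_arena = True

def particles_on_gpu(alloc):
    """Mirrors ParticleCreation::particles_on_gpu: also accept the
    polymorphic arena allocator, which RunOnGpu alone misses."""
    return alloc.runs_on_gpu or getattr(alloc, "polymorphic_arena", False)

assert particles_on_gpu(ArenaAllocator)
assert not particles_on_gpu(PinnedAllocator)
assert particles_on_gpu(PolymorphicArenaAllocator)  # the case fixed above
```
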