diff --git a/Examples/CMakeLists.txt b/Examples/CMakeLists.txt
index cb8057a8a71..b3e1fac1ad5 100644
--- a/Examples/CMakeLists.txt
+++ b/Examples/CMakeLists.txt
@@ -145,6 +145,8 @@ function(add_warpx_test )
 
         # FIXME Use helper function to handle Windows exceptions
         set_property(TEST ${name}.run APPEND PROPERTY ENVIRONMENT "PYTHONPATH=$ENV{PYTHONPATH}:${CMAKE_PYTHON_OUTPUT_DIRECTORY}")
+        # allocate GPU memory on-demand for Python tests too (same reason as native tests below)
+        set_property(TEST ${name}.run APPEND PROPERTY ENVIRONMENT "AMREX_THE_ARENA_INIT_SIZE=0")
     else()
         # TODO Use these for Python tests too
         set(runtime_params
diff --git a/Examples/Physics_applications/pierce_diode/analysis_default_regression.py b/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
index e143e396f0c..4563b7cd3c6 100755
--- a/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
+++ b/Examples/Physics_applications/pierce_diode/analysis_default_regression.py
@@ -76,6 +76,7 @@ def main(args):
             OpenPMDTimeSeries(args.path)
         except Exception:
             print("Could not open the file as a plotfile or an openPMD time series")
+            args.format = "openpmd"
         else:
             args.format = "openpmd"
     else:
diff --git a/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py b/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
index 55365bd4c76..3a623d702bc 100755
--- a/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
+++ b/Examples/Tests/electrostatic_sphere_eb/analysis_rz_mr.py
@@ -26,7 +26,7 @@ def find_first_non_zero_from_bottom_left(matrix):
     for i in range(matrix.shape[0]):
         for j in range(matrix.shape[1]):
-            if (matrix[i][j] != 0) and (matrix[i][j] != np.nan):
+            if (matrix[i][j] != 0) and (not np.isnan(matrix[i][j])):
                 return (i, j)
     return i, j
 
 
@@ -34,7 +34,7 @@ def find_first_non_zero_from_upper_right(matrix):
     for i in range(matrix.shape[0] - 1, -1, -1):
         for j in range(matrix.shape[1] - 1, -1, -1):
-            if (matrix[i][j] != 0) and (matrix[i][j] != np.nan):
+            if (matrix[i][j] != 0) and (not np.isnan(matrix[i][j])):
                 return (i, j)
     return i, j
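
Note on the `np.isnan` change above: `x != np.nan` is true for every `x`, including NaN itself, because NaN compares unequal to everything, so the old guard never actually filtered NaN cells. A minimal standalone check (plain NumPy, independent of WarpX):

```python
import numpy as np

cell = np.nan
print(cell != np.nan)  # True: NaN is unequal to everything, even NaN
print(np.isnan(cell))  # True: the correct NaN test

# The old condition lets a NaN cell through; the new one rejects it.
print((cell != 0) and (cell != np.nan))      # True  (wrong)
print((cell != 0) and (not np.isnan(cell)))  # False (right)
```
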
diff --git a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
index c0cd729f50c..146b21d8ab2 100644
--- a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
+++ b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb
@@ -15,7 +15,7 @@ my_constants.dt = 0.1/wpe # s
 #################################
 max_step = 20
 amr.n_cell = 40 40
-amr.max_grid_size = 8
+amr.max_grid_size = 16
 amr.blocking_factor = 8
 amr.max_level = 0
 geometry.dims = 2
diff --git a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
index c7457e02af8..99c8a1f0468 100644
--- a/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
+++ b/Examples/Tests/implicit/inputs_test_2d_theta_implicit_jfnk_vandb_filtered
@@ -15,7 +15,7 @@ my_constants.dt = 0.1/wpe # s
 #################################
 max_step = 20
 amr.n_cell = 40 40
-amr.max_grid_size = 8
+amr.max_grid_size = 16
 amr.blocking_factor = 8
 amr.max_level = 0
 geometry.dims = 2
diff --git a/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd b/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
index cf95a07e2fc..668654d8007 100644
--- a/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
+++ b/Examples/Tests/langmuir/inputs_test_2d_langmuir_multi_mr_psatd
@@ -4,6 +4,7 @@ FILE = inputs_base_2d
 # test input parameters
 algo.maxwell_solver = psatd
 amr.max_level = 1
+amr.max_grid_size = 128
 amr.ref_ratio = 4
 diag1.electrons.variables = x z w ux uy uz
 diag1.positrons.variables = x z w ux uy uz
diff --git a/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb b/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
index 73ff14dc8a4..45c17e7984a 100644
--- a/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
+++ b/Examples/Tests/magnetostatic_eb/inputs_test_3d_magnetostatic_eb
@@ -29,7 +29,7 @@ algo.particle_shape = 1
 algo.current_deposition = direct
 particles.species_names = beam
-particles.do_tiling = 1
+particles.do_tiling = 0
 
 beam.mass = m_e
 beam.charge = -q_e
 beam.injection_style = nuniformpercell
diff --git a/Examples/analysis_default_regression.py b/Examples/analysis_default_regression.py
index 00cc4e31536..1078c352aec 100755
--- a/Examples/analysis_default_regression.py
+++ b/Examples/analysis_default_regression.py
@@ -74,6 +74,7 @@ def main(args):
             OpenPMDTimeSeries(args.path)
         except Exception:
             print("Could not open the file as a plotfile or an openPMD time series")
+            args.format = "openpmd"
         else:
             args.format = "openpmd"
     else:
diff --git a/Examples/analysis_default_restart.py b/Examples/analysis_default_restart.py
index ad6bc22e60e..a12dcb1becf 100755
--- a/Examples/analysis_default_restart.py
+++ b/Examples/analysis_default_restart.py
@@ -49,11 +49,22 @@ def check_restart(filename, tolerance=1e-12):
         dims=ds_benchmark.domain_dimensions,
     )
 
-    # Loop over all fields (all particle species, all particle attributes, all grid fields)
-    # and compare output data generated from initial run with output data generated after restart
+    # Separate grid fields from particle fields. Particle fields use the
+    # species name as field type; grid fields use 'boxlib'.
+    particle_species = set()
+    grid_fields = []
+    for field in ds_benchmark.field_list:
+        ftype, fname = field
+        if ftype == "boxlib":
+            grid_fields.append(field)
+        elif ftype != "all":
+            particle_species.add(ftype)
+
     print(f"\ntolerance = {tolerance}")
     print()
-    for field in ds_benchmark.field_list:
+
+    # Compare grid fields directly (order is deterministic)
+    for field in grid_fields:
         dr = ad_restart[field].squeeze().v
         db = ad_benchmark[field].squeeze().v
         error = np.amax(np.abs(dr - db))
@@ -61,6 +72,31 @@ def check_restart(filename, tolerance=1e-12):
             error /= np.amax(np.abs(db))
         print(f"field: {field}; error = {error}")
         assert error < tolerance
+
+    # Compare particle fields sorted by (particle_cpu, particle_id), since
+    # Redistribute() after checkpoint-restart may reorder particles across
+    # tiles/ranks. The (cpu, id) pair is the unique particle key in AMReX.
+    for species in sorted(particle_species):
+        species_fields = [f for f in ds_benchmark.field_list if f[0] == species]
+
+        id_r = np.atleast_1d(ad_restart[(species, "particle_id")].squeeze().v)
+        id_b = np.atleast_1d(ad_benchmark[(species, "particle_id")].squeeze().v)
+        cpu_r = np.atleast_1d(ad_restart[(species, "particle_cpu")].squeeze().v)
+        cpu_b = np.atleast_1d(ad_benchmark[(species, "particle_cpu")].squeeze().v)
+
+        sort_r = np.lexsort((id_r, cpu_r))
+        sort_b = np.lexsort((id_b, cpu_b))
+
+        for field in species_fields:
+            if field[1] in ("particle_id", "particle_cpu"):
+                continue
+            dr = np.atleast_1d(ad_restart[field].squeeze().v)[sort_r]
+            db = np.atleast_1d(ad_benchmark[field].squeeze().v)[sort_b]
+            error = np.amax(np.abs(dr - db))
+            if np.amax(np.abs(db)) != 0.0:
+                error /= np.amax(np.abs(db))
+            print(f"field: {field}; error = {error}")
+            assert error < tolerance
 
     print()
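
For reference on the sorting change above: `np.lexsort` treats its last key as the primary key, so `np.lexsort((id, cpu))` sorts by `particle_cpu` first and `particle_id` second, which is the (cpu, id) key named in the comment. A self-contained sketch with toy data (not WarpX output) showing how the two permutations line up:

```python
import numpy as np

# Same three particles in two different storage orders, as can happen
# after Redistribute() on restart.
cpu_b = np.array([0, 0, 1])
id_b = np.array([2, 1, 1])
x_b = np.array([10.0, 20.0, 30.0])  # benchmark attribute order

perm = np.array([2, 0, 1])  # hypothetical reordering after restart
cpu_r, id_r, x_r = cpu_b[perm], id_b[perm], x_b[perm]

# Sort both datasets by (cpu, id); the LAST lexsort key is primary.
sort_b = np.lexsort((id_b, cpu_b))
sort_r = np.lexsort((id_r, cpu_r))

assert np.array_equal(x_b[sort_b], x_r[sort_r])  # attributes comparable again
```
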
diff --git a/Python/pywarpx/_libwarpx.py b/Python/pywarpx/_libwarpx.py
index 4b033e36239..dce36d5a554 100755
--- a/Python/pywarpx/_libwarpx.py
+++ b/Python/pywarpx/_libwarpx.py
@@ -71,6 +71,17 @@ def load_library(self):
                 "Please write separate scripts for each geometry."
             )
 
+        # Import mpi4py before the pyAMReX (amrex.space*d) shared library is
+        # loaded so that mpi4py calls MPI_Init_thread first. With the Cray
+        # MPICH on Polaris/Sirius, if the AMReX shared library is loaded before
+        # mpi4py initializes MPI, mpi4py's later MPI_Init_thread conflicts with
+        # the already-loaded MPI symbols and causes hangs or unbounded memory
+        # allocation during amrex::Initialize().
+        try:
+            from mpi4py import MPI  # noqa: F811,F401
+        except ImportError:
+            pass  # mpi4py is optional; MPI_Init handled by AMReX if absent
+
         # --- Use geometry to determine whether to import the 1D, 2D, 3D or RZ version.
         # --- The geometry must be setup before the lib warpx shared object can be loaded.
         try:
@@ -147,7 +158,7 @@ def load_library(self):
         register_warpx_WarpXParticleContainer_extension(self.libwarpx_so)
 
     def amrex_init(self, argv, mpi_comm=None):
-        if mpi_comm is None:  # or MPI is None:
+        if mpi_comm is None:
             self.libwarpx_so.amrex_init(argv)
         else:
             raise Exception("mpi_comm argument not yet supported")
@@ -172,9 +183,28 @@ def finalize(self, finalize_mpi=1):
         """
         # TODO: simplify, part of pyAMReX already
         if self.initialized:
+            # GPU finalization workaround: on GPU backends, destroying the C++
+            # WarpX object (del self.warpx) or calling amrex_finalize() can
+            # crash in SYCL/CUDA/HIP Device::Finalize() when it tries to
+            # synchronize streams whose backing static objects are already gone.
+            # Exit immediately via os._exit(0) BEFORE any C++ destructors run,
+            # and let the job launcher (PALS/mpiexec) handle cleanup.
+            try:
+                if self.libwarpx_so.Config.gpu_backend in ("SYCL", "CUDA", "HIP"):
+                    try:
+                        from mpi4py import MPI
+
+                        MPI.COMM_WORLD.Barrier()
+                    except Exception:
+                        pass  # best-effort barrier; proceeding to _exit
+                    os._exit(0)
+            except Exception:
+                pass  # Config may not be available; fall through to normal cleanup
+
             del self.warpx
             # The call to warpx_finalize causes a crash - don't know why
             # self.libwarpx_so.warpx_finalize()
+
             self.libwarpx_so.amrex_finalize()
 
             from pywarpx import callbacks
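
The import-order rule encoded in `load_library` above also applies to user scripts that import pyAMReX directly. A minimal sketch of the safe ordering, assuming a 3D pyAMReX build (`amrex.space3d`) and mpi4py installed:

```python
# Initialize MPI via mpi4py BEFORE any AMReX-linked extension module is
# imported, so that MPI_Init_thread runs exactly once, up front.
from mpi4py import MPI  # first: initializes MPI

import amrex.space3d as amr  # then: loads the AMReX shared library

amr.initialize([])
print(f"rank {MPI.COMM_WORLD.Get_rank()} of {MPI.COMM_WORLD.Get_size()}")
amr.finalize()
```
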
diff --git a/Source/Diagnostics/WarpXOpenPMD.cpp b/Source/Diagnostics/WarpXOpenPMD.cpp
index 06b446f6158..0634cfc89b0 100644
--- a/Source/Diagnostics/WarpXOpenPMD.cpp
+++ b/Source/Diagnostics/WarpXOpenPMD.cpp
@@ -593,6 +593,10 @@ for (const auto & particle_diag : particle_diags) {
         particlesConvertUnits(ConvertDirection::SI_to_WarpX, pc, mass);
     }
 
+    // On GPU backends, copyParticles may be asynchronous; synchronize before
+    // passing pinned-memory pointers to openPMD's storeChunkRaw.
+    amrex::Gpu::streamSynchronize();
+
     // Gather the electrostatic potential (phi) on the macroparticles
     if ( particle_diag.m_plot_phi ) {
         storePhiOnParticles( tmp, WarpX::electrostatic_solver_id, !use_pinned_pc );
diff --git a/Source/Particles/ParticleCreation/DefaultInitialization.H b/Source/Particles/ParticleCreation/DefaultInitialization.H
index 76b40cf5252..81dac7295ef 100644
--- a/Source/Particles/ParticleCreation/DefaultInitialization.H
+++ b/Source/Particles/ParticleCreation/DefaultInitialization.H
@@ -14,6 +14,7 @@
 #   include "Particles/ElementaryProcess/QEDInternals/QuantumSyncEngineWrapper.H"
 #endif
 
+#include <AMReX_GpuAllocators.H>
 #include <...>
 #include <...>
 
@@ -21,6 +22,23 @@
 #include <...>
 #include <...>
 
+namespace ParticleCreation {
+/** Whether particle data with allocator Alloc lives on the GPU.
+ *  amrex::RunOnGpu is not specialised for PolymorphicArenaAllocator,
+ *  so we extend the check here. PolymorphicArenaAllocator in WarpX
+ *  always wraps a device arena, so it is safe to treat it as GPU. */
+template <typename Alloc>
+constexpr bool particles_on_gpu ()
+{
+#ifdef AMREX_USE_GPU
+    return amrex::RunOnGpu<Alloc>::value
+        || amrex::IsPolymorphicArenaAllocator<Alloc>::value;
+#else
+    return false;
+#endif
+}
+} // namespace ParticleCreation
+
 /**
  * \brief This set of initialization policies describes what happens
  * when we need to create a new particle due to an elementary process.
@@ -158,7 +176,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const QuantumSynchrotronGetOpticalDepth quantum_sync_get_opt =
                 p_qs_engine->build_optical_depth_functor();
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelForRNG(stop - start,
                 [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept {
                     const int ip = i + start;
@@ -185,7 +203,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const BreitWheelerGetOpticalDepth breit_wheeler_get_opt =
                 p_bw_engine->build_optical_depth_functor();;
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelForRNG(stop - start,
                 [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept {
                     const int ip = i + start;
@@ -214,7 +232,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             const amrex::ParserExecutor<7> user_real_attrib_parserexec =
                 user_real_attrib_parser[ia]->compile<7>();
             // If the particle tile was allocated in a memory pool that can run on GPU, launch GPU kernel
-            if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<amrex::ParticleReal>>::value) {
+            if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<amrex::ParticleReal>>()) {
                 amrex::ParallelFor(stop - start,
                 [=] AMREX_GPU_DEVICE (int i) noexcept {
                     const int ip = i + start;
@@ -246,7 +264,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             if (it_ioniz != particle_icomps.end() &&
                 std::distance(particle_icomps.begin(), it_ioniz) == j)
             {
-                if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<int>>::value) {
+                if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<int>>()) {
                     amrex::ParallelFor(stop - start,
                     [=] AMREX_GPU_DEVICE (int i) noexcept {
                         const int ip = i + start;
@@ -268,7 +286,7 @@ void DefaultInitializeRuntimeAttributes (PTile& ptile,
             {
                 const amrex::ParserExecutor<7> user_int_attrib_parserexec =
                     user_int_attrib_parser[ia]->compile<7>();
-                if constexpr (amrex::RunOnGpu<typename PTile::template AllocatorType<int>>::value) {
+                if constexpr (ParticleCreation::particles_on_gpu<typename PTile::template AllocatorType<int>>()) {
                     amrex::ParallelFor(stop - start,
                     [=] AMREX_GPU_DEVICE (int i) noexcept {
                         const int ip = i + start;
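
Why the trait extension above is needed: `DefaultInitializeRuntimeAttributes` picks between a GPU kernel launch and a host loop at compile time, and plain `amrex::RunOnGpu` reports false for `PolymorphicArenaAllocator`, so the host path would be chosen even though the tile memory lives in a device arena. A toy Python illustration of that dispatch logic (illustrative types only, not the AMReX API):

```python
# Toy model of the compile-time dispatch in DefaultInitializeRuntimeAttributes.
class ArenaAllocator:  # device memory; amrex::RunOnGpu -> true
    runs_on_gpu = True

class PinnedAllocator:  # host memory; RunOnGpu -> false
    runs_on_gpu = False

class PolymorphicArenaAllocator:  # RunOnGpu -> false, but wraps a device arena
    runs_on_gpu = False
    polymorphic_arena = True

def particles_on_gpu(alloc):
    """Mirrors ParticleCreation::particles_on_gpu: also accept the
    polymorphic arena allocator, which RunOnGpu alone misses."""
    return alloc.runs_on_gpu or getattr(alloc, "polymorphic_arena", False)

assert particles_on_gpu(ArenaAllocator)
assert not particles_on_gpu(PinnedAllocator)
assert particles_on_gpu(PolymorphicArenaAllocator)  # the case fixed above
```
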