[enhancement] Augmentation operator on GPUs (#803)

Modify the code to run large plane-wave cutoffs. In this case even local number of G-vectors has to be split in chunks such that augmentation operator can fit in the GPU memory. * local G-vector chunk size is controlled with input parameter * augmentation operator was simplified; prepare() and dismiss() functions were removed * augmentation operator derivative class was incorporated in the Augmentation_operator class With this modifications it was possible to run Au-surf example on 400 nodes of Piz Daint (pw cutoff: 60 a.u.^-1, gk cutoff 25 a.u.^-1)
electronic-structure · Jan 30, 2023 · d371df2 · d371df2
1 parent f5393f9
commit d371df2
Show file tree

Hide file tree

Showing 37 changed files with 766 additions and 864 deletions.
diff --git a/apps/tests/test_wf_fft.cpp b/apps/tests/test_wf_fft.cpp
@@ -7,7 +7,7 @@ using namespace sirius;
 void test_wf_fft()
 {
     //sddk::MPI_grid mpi_grid({2, 3}, sddk::Communicator::world());
-    mpi::Grid mpi_grid({1, 1}, mpi::Communicator::world());
+    mpi::Grid mpi_grid({2, 2}, mpi::Communicator::world());
 
     /* creation of simple G+k vector set */
     auto gkvec = fft::gkvec_factory(8.0, mpi_grid.communicator());
@@ -18,7 +18,8 @@ void test_wf_fft()
     /* get the FFT box boundaries */
     auto fft_grid = fft::get_min_grid(8.0, gkvec->lattice_vectors());
 
-    std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
+    //std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
+    std::vector<int> num_mt_coeffs({1});
 
     wf::Wave_functions<double> wf(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
     wf::Wave_functions<double> wf_ref(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
@@ -32,7 +33,7 @@ void test_wf_fft()
             }
         }
     }
-    auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);
+    //auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);
 
     auto pu = sddk::device_t::CPU;
 
@@ -61,10 +62,10 @@ void test_wf_fft()
     for (int ispn = 0; ispn < 2; ispn++) {
 
         wf::Wave_functions_fft<double> wf_fft(gkvec_fft, wf, wf::spin_index(ispn), wf::band_range(0,10),
-                wf::shuffle_to::fft_layout | wf::shuffle_to::wf_layout);
+                wf::shuffle_to::wf_layout);
 
         for (int i = 0; i < wf_fft.num_wf_local(); i++) {
-            spfft_transform->backward(wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
+            spfft_transform->backward(wf1[ispn].pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
             spfft_transform->forward(spfft_pu, wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), SPFFT_FULL_SCALING);
         }
     }

diff --git a/apps/unit_tests/test_rlm_deriv.cpp b/apps/unit_tests/test_rlm_deriv.cpp
@@ -482,7 +482,7 @@ int test3()
                            1 + 10 * utils::random<double>(),
                            1 + 10 * utils::random<double>());
 
-        auto rtp = SHT::spherical_coordinates(v);
+        auto rtp = r3::spherical_coordinates(v);
 
         double dr = 1e-5 * rtp[0];
 
@@ -495,8 +495,8 @@ int test3()
             r3::vector<double> v2 = v;
             v2[x] -= dr;
 
-            auto vs1 = SHT::spherical_coordinates(v1);
-            auto vs2 = SHT::spherical_coordinates(v2);
+            auto vs1 = r3::spherical_coordinates(v1);
+            auto vs2 = r3::spherical_coordinates(v2);
             std::vector<double> rlm1(lmmax);
             std::vector<double> rlm2(lmmax);
 

diff --git a/apps/unit_tests/test_rot_ylm.cpp b/apps/unit_tests/test_rot_ylm.cpp
@@ -44,10 +44,10 @@ int run_test_impl(cmd_args& args)
 
             /* random Cartesian vector */
             r3::vector<double> coord(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
-            auto scoord = SHT::spherical_coordinates(coord);
+            auto scoord = r3::spherical_coordinates(coord);
             /* rotated coordinates */
             auto coord2 = dot(Rc, coord);
-            auto scoord2 = SHT::spherical_coordinates(coord2);
+            auto scoord2 = r3::spherical_coordinates(coord2);
 
             int lmax{10};
             sddk::mdarray<T, 1> ylm(utils::lmmax(lmax));

diff --git a/src/SDDK/memory.hpp b/src/SDDK/memory.hpp
@@ -678,7 +678,7 @@ sddk::memory_pool& get_memory_pool(sddk::memory_t M__);
     {                                                                             \
         if (!(condition__)) {                                                     \
             std::stringstream _s;                                                 \
-            _s << "Assertion (" <<  #condition__ << ") failed "                   \
+            _s << "Assertion (" << #condition__ << ") failed "                    \
                << "at line " << __LINE__ << " of file " << __FILE__ << std::endl  \
                << "array label: " << label_ << std::endl;                         \
             for (int i = 0; i < N; i++) {                                         \
@@ -755,6 +755,20 @@ class mdarray_index_descriptor
     {
         return size_;
     }
+
+    inline bool check_range(index_type i__) const
+    {
+#ifdef NDEBUG
+        return true;
+#else
+        if (i__ < begin_ || i__ > end_) {
+            std::cout << "index " << i__ << " out of range [" << begin_ << ", " << end_ << "]" << std::endl;
+            return false;
+        } else {
+            return true;
+        }
+#endif
+    }
 };
 
 /// Multidimensional array with the column-major (Fortran) order.
@@ -833,7 +847,8 @@ class mdarray
         std::array<index_type, N> i = {args...};
 
         for (int j = 0; j < N; j++) {
-            mdarray_assert(i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
+            //mdarray_assert(dims_[j].check_range(i[j]) && i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
+            mdarray_assert(dims_[j].check_range(i[j]));
         }
 
         size_t idx = offsets_[0] + i[0];

diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp
@@ -2768,7 +2768,7 @@ sirius_find_eigen_states(void* const* gs_handler__, void* const* ks_handler__, b
                 gs.potential().update_atomic_potential();
             }
             if (precompute_rf__ && *precompute_rf__) {
-                const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions();
+                const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions(gs.ctx().out());
             }
             if (precompute_ri__ && *precompute_ri__) {
                 const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_integrals();
@@ -4033,7 +4033,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__,
                         gkvec(x, igk)      = kp->gkvec().template gkvec<sddk::index_domain_t::global>(igk)[x];
                         gkvec_cart(x, igk) = gkc[x];
                     }
-                    auto rtp         = sirius::SHT::spherical_coordinates(gkc);
+                    auto rtp         = r3::spherical_coordinates(gkc);
                     gkvec_len[igk]   = rtp[0];
                     gkvec_tp(0, igk) = rtp[1];
                     gkvec_tp(1, igk) = rtp[2];

diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp
@@ -46,6 +46,8 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
     /* block size of scalapack 2d block-cyclic distribution */
     int bs = ctx_.cyclic_block_size();
 
+    auto pcs = env::print_checksum();
+
     la::dmatrix<std::complex<double>> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));
     la::dmatrix<std::complex<double>> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));
 
@@ -76,7 +78,7 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
         }
     }
 
-    if (ctx_.print_checksum()) {
+    if (pcs) {
         auto z1 = h.checksum(ngklo, ngklo);
         auto z2 = o.checksum(ngklo, ngklo);
         utils::print_checksum("h_lapw", z1, ctx_.out());
@@ -107,11 +109,9 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
         }
     }
 
-    if (ctx_.print_checksum()) {
+    if (pcs) {
         auto z1 = kp.fv_eigen_vectors().checksum(kp.gklo_basis_size(), ctx_.num_fv_states());
-        if (kp.comm().rank() == 0) {
-            utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
-        }
+        utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
     }
 
     /* remap to slab */
@@ -129,6 +129,12 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
             la::constant<std::complex<double>>::zero(), kp.comm().native());
     }
 
+    if (pcs) {
+        auto z1 = kp.fv_eigen_vectors_slab().checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        auto z2 = kp.fv_eigen_vectors_slab().checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
+        utils::print_checksum("fv_eigen_vectors_slab", z1 + z2, kp.out(1));
+    }
+
     /* renormalize wave-functions */
     if (ctx_.valence_relativity() == relativity_t::iora) {
 

diff --git a/src/beta_projectors/beta_projectors.hpp b/src/beta_projectors/beta_projectors.hpp
@@ -57,7 +57,7 @@ class Beta_projectors : public Beta_projectors_base<T>
         #pragma omp parallel for
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             /* vs = {r, theta, phi} */
-            auto vs = SHT::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
+            auto vs = r3::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
             /* compute real spherical harmonics for G+k vector */
             std::vector<double> gkvec_rlm(utils::lmmax(this->ctx_.unit_cell().lmax()));
             sf::spherical_harmonics(this->ctx_.unit_cell().lmax(), vs[1], vs[2], &gkvec_rlm[0]);

diff --git a/src/beta_projectors/beta_projectors_strain_deriv.hpp b/src/beta_projectors/beta_projectors_strain_deriv.hpp
@@ -55,7 +55,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
         #pragma omp parallel for schedule(static)
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
-            auto rtp = SHT::spherical_coordinates(gvc);
+            auto rtp = r3::spherical_coordinates(gvc);
 
             double theta = rtp[1];
             double phi   = rtp[2];
@@ -70,7 +70,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
         for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
             auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
             /* vs = {r, theta, phi} */
-            auto gvs = SHT::spherical_coordinates(gvc);
+            auto gvs = r3::spherical_coordinates(gvc);
 
             /* |G+k|=0 case */
             if (gvs[0] < 1e-10) {

diff --git a/src/context/config.hpp b/src/context/config.hpp
@@ -943,6 +943,18 @@ class config_t
             }
             dict_["/control/output"_json_pointer] = output__;
         }
+        /// Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator.
+        inline auto gvec_chunk_size() const
+        {
+            return dict_.at("/control/gvec_chunk_size"_json_pointer).get<int>();
+        }
+        inline void gvec_chunk_size(int gvec_chunk_size__)
+        {
+            if (dict_.contains("locked")) {
+                throw std::runtime_error(locked_msg);
+            }
+            dict_["/control/gvec_chunk_size"_json_pointer] = gvec_chunk_size__;
+        }
       private:
         nlohmann::json& dict_;
     };

diff --git a/src/context/input_schema.json b/src/context/input_schema.json
@@ -411,36 +411,41 @@
                     "type" : "boolean",
                     "default" : false,
                     "title" : "If true then the atomic forces are printed at the end of SCF run."
-                 },
-                 "print_timers" : {
-                     "type" : "boolean",
-                     "default" : true,
-                     "title" : "If true then the timer statistics is printed at the end of SCF run."
-                 },
-                 "print_neighbors" : {
-                     "type" : "boolean",
-                     "default" : false,
-                     "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
-                 },
-                 "use_second_variation" : {
-                     "type" : "boolean",
-                     "default" : true,
-                     "title" : "True if second-variational diagonalization is used in LAPW method."
-                 },
-                 "beta_chunk_size" : {
-                     "type" : "integer",
-                     "default" : 256,
-                     "title" : "Number of atoms in a chunk of beta-projectors."
-                 },
-                 "ortho_rf" : {
-                     "type" : "boolean",
-                     "default" : false,
-                     "title" : "Orthogonalize LAPW radial functions."
-                 },
-                 "output" : {
-                     "type" : "string",
-                     "default" : "stdout:",
-                     "title" : "Type of the output stream (stdout:, file:name)"
+                },
+                "print_timers" : {
+                    "type" : "boolean",
+                    "default" : true,
+                    "title" : "If true then the timer statistics is printed at the end of SCF run."
+                },
+                "print_neighbors" : {
+                    "type" : "boolean",
+                    "default" : false,
+                    "title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
+                },
+                "use_second_variation" : {
+                    "type" : "boolean",
+                    "default" : true,
+                    "title" : "True if second-variational diagonalization is used in LAPW method."
+                },
+                "beta_chunk_size" : {
+                    "type" : "integer",
+                    "default" : 256,
+                    "title" : "Number of atoms in a chunk of beta-projectors."
+                },
+                "ortho_rf" : {
+                    "type" : "boolean",
+                    "default" : false,
+                    "title" : "Orthogonalize LAPW radial functions."
+                },
+                "output" : {
+                    "type" : "string",
+                    "default" : "stdout:",
+                    "title" : "Type of the output stream (stdout:, file:name)"
+                },
+                "gvec_chunk_size" : {
+                    "type" : "integer",
+                    "default" : 500000,
+                    "title" : "Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator."
                 }
             }
         },