Skip to content

Commit

Permalink
[enhancement] Augmentation operator on GPUs (#803)
Browse files Browse the repository at this point in the history
Modify the code to run large plane-wave cutoffs. In this case even local number of G-vectors has to be split in chunks such that augmentation operator can fit in the GPU memory.

* local G-vector chunk size is controlled with input parameter
* augmentation operator was simplified; prepare() and dismiss() functions were removed
* augmentation operator derivative class was incorporated in the Augmentation_operator class

With this modifications it was possible to run Au-surf example on 400 nodes of Piz Daint (pw cutoff: 60 a.u.^-1, gk cutoff 25 a.u.^-1)
  • Loading branch information
toxa81 authored Jan 30, 2023
1 parent f5393f9 commit d371df2
Show file tree
Hide file tree
Showing 37 changed files with 766 additions and 864 deletions.
11 changes: 6 additions & 5 deletions apps/tests/test_wf_fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using namespace sirius;
void test_wf_fft()
{
//sddk::MPI_grid mpi_grid({2, 3}, sddk::Communicator::world());
mpi::Grid mpi_grid({1, 1}, mpi::Communicator::world());
mpi::Grid mpi_grid({2, 2}, mpi::Communicator::world());

/* creation of simple G+k vector set */
auto gkvec = fft::gkvec_factory(8.0, mpi_grid.communicator());
Expand All @@ -18,7 +18,8 @@ void test_wf_fft()
/* get the FFT box boundaries */
auto fft_grid = fft::get_min_grid(8.0, gkvec->lattice_vectors());

std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
//std::vector<int> num_mt_coeffs({10, 20, 30, 10, 20});
std::vector<int> num_mt_coeffs({1});

wf::Wave_functions<double> wf(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
wf::Wave_functions<double> wf_ref(gkvec, num_mt_coeffs, wf::num_mag_dims(1), wf::num_bands(10), sddk::memory_t::host);
Expand All @@ -32,7 +33,7 @@ void test_wf_fft()
}
}
}
auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);
//auto mg = wf.memory_guard(sddk::memory_t::device, wf::copy_to::device);

auto pu = sddk::device_t::CPU;

Expand Down Expand Up @@ -61,10 +62,10 @@ void test_wf_fft()
for (int ispn = 0; ispn < 2; ispn++) {

wf::Wave_functions_fft<double> wf_fft(gkvec_fft, wf, wf::spin_index(ispn), wf::band_range(0,10),
wf::shuffle_to::fft_layout | wf::shuffle_to::wf_layout);
wf::shuffle_to::wf_layout);

for (int i = 0; i < wf_fft.num_wf_local(); i++) {
spfft_transform->backward(wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
spfft_transform->backward(wf1[ispn].pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), spfft_pu);
spfft_transform->forward(spfft_pu, wf_fft.pw_coeffs_spfft(sddk::memory_t::host, wf::band_index(i)), SPFFT_FULL_SCALING);
}
}
Expand Down
6 changes: 3 additions & 3 deletions apps/unit_tests/test_rlm_deriv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ int test3()
1 + 10 * utils::random<double>(),
1 + 10 * utils::random<double>());

auto rtp = SHT::spherical_coordinates(v);
auto rtp = r3::spherical_coordinates(v);

double dr = 1e-5 * rtp[0];

Expand All @@ -495,8 +495,8 @@ int test3()
r3::vector<double> v2 = v;
v2[x] -= dr;

auto vs1 = SHT::spherical_coordinates(v1);
auto vs2 = SHT::spherical_coordinates(v2);
auto vs1 = r3::spherical_coordinates(v1);
auto vs2 = r3::spherical_coordinates(v2);
std::vector<double> rlm1(lmmax);
std::vector<double> rlm2(lmmax);

Expand Down
4 changes: 2 additions & 2 deletions apps/unit_tests/test_rot_ylm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ int run_test_impl(cmd_args& args)

/* random Cartesian vector */
r3::vector<double> coord(double(rand()) / RAND_MAX, double(rand()) / RAND_MAX, double(rand()) / RAND_MAX);
auto scoord = SHT::spherical_coordinates(coord);
auto scoord = r3::spherical_coordinates(coord);
/* rotated coordinates */
auto coord2 = dot(Rc, coord);
auto scoord2 = SHT::spherical_coordinates(coord2);
auto scoord2 = r3::spherical_coordinates(coord2);

int lmax{10};
sddk::mdarray<T, 1> ylm(utils::lmmax(lmax));
Expand Down
19 changes: 17 additions & 2 deletions src/SDDK/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ sddk::memory_pool& get_memory_pool(sddk::memory_t M__);
{ \
if (!(condition__)) { \
std::stringstream _s; \
_s << "Assertion (" << #condition__ << ") failed " \
_s << "Assertion (" << #condition__ << ") failed " \
<< "at line " << __LINE__ << " of file " << __FILE__ << std::endl \
<< "array label: " << label_ << std::endl; \
for (int i = 0; i < N; i++) { \
Expand Down Expand Up @@ -755,6 +755,20 @@ class mdarray_index_descriptor
{
return size_;
}

inline bool check_range(index_type i__) const
{
#ifdef NDEBUG
return true;
#else
if (i__ < begin_ || i__ > end_) {
std::cout << "index " << i__ << " out of range [" << begin_ << ", " << end_ << "]" << std::endl;
return false;
} else {
return true;
}
#endif
}
};

/// Multidimensional array with the column-major (Fortran) order.
Expand Down Expand Up @@ -833,7 +847,8 @@ class mdarray
std::array<index_type, N> i = {args...};

for (int j = 0; j < N; j++) {
mdarray_assert(i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
//mdarray_assert(dims_[j].check_range(i[j]) && i[j] >= dims_[j].begin() && i[j] <= dims_[j].end());
mdarray_assert(dims_[j].check_range(i[j]));
}

size_t idx = offsets_[0] + i[0];
Expand Down
4 changes: 2 additions & 2 deletions src/api/sirius_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2768,7 +2768,7 @@ sirius_find_eigen_states(void* const* gs_handler__, void* const* ks_handler__, b
gs.potential().update_atomic_potential();
}
if (precompute_rf__ && *precompute_rf__) {
const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions();
const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_functions(gs.ctx().out());
}
if (precompute_ri__ && *precompute_ri__) {
const_cast<sirius::Unit_cell&>(gs.ctx().unit_cell()).generate_radial_integrals();
Expand Down Expand Up @@ -4033,7 +4033,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__,
gkvec(x, igk) = kp->gkvec().template gkvec<sddk::index_domain_t::global>(igk)[x];
gkvec_cart(x, igk) = gkc[x];
}
auto rtp = sirius::SHT::spherical_coordinates(gkc);
auto rtp = r3::spherical_coordinates(gkc);
gkvec_len[igk] = rtp[0];
gkvec_tp(0, igk) = rtp[1];
gkvec_tp(1, igk) = rtp[2];
Expand Down
16 changes: 11 additions & 5 deletions src/band/diag_full_potential.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
/* block size of scalapack 2d block-cyclic distribution */
int bs = ctx_.cyclic_block_size();

auto pcs = env::print_checksum();

la::dmatrix<std::complex<double>> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));
la::dmatrix<std::complex<double>> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, get_memory_pool(solver.host_memory_t()));

Expand Down Expand Up @@ -76,7 +78,7 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
}
}

if (ctx_.print_checksum()) {
if (pcs) {
auto z1 = h.checksum(ngklo, ngklo);
auto z2 = o.checksum(ngklo, ngklo);
utils::print_checksum("h_lapw", z1, ctx_.out());
Expand Down Expand Up @@ -107,11 +109,9 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
}
}

if (ctx_.print_checksum()) {
if (pcs) {
auto z1 = kp.fv_eigen_vectors().checksum(kp.gklo_basis_size(), ctx_.num_fv_states());
if (kp.comm().rank() == 0) {
utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
}
utils::print_checksum("fv_eigen_vectors", z1, kp.out(1));
}

/* remap to slab */
Expand All @@ -129,6 +129,12 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k<double>& Hk__) con
la::constant<std::complex<double>>::zero(), kp.comm().native());
}

if (pcs) {
auto z1 = kp.fv_eigen_vectors_slab().checksum_pw(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
auto z2 = kp.fv_eigen_vectors_slab().checksum_mt(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, ctx_.num_fv_states()));
utils::print_checksum("fv_eigen_vectors_slab", z1 + z2, kp.out(1));
}

/* renormalize wave-functions */
if (ctx_.valence_relativity() == relativity_t::iora) {

Expand Down
2 changes: 1 addition & 1 deletion src/beta_projectors/beta_projectors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class Beta_projectors : public Beta_projectors_base<T>
#pragma omp parallel for
for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
/* vs = {r, theta, phi} */
auto vs = SHT::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
auto vs = r3::spherical_coordinates(this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc));
/* compute real spherical harmonics for G+k vector */
std::vector<double> gkvec_rlm(utils::lmmax(this->ctx_.unit_cell().lmax()));
sf::spherical_harmonics(this->ctx_.unit_cell().lmax(), vs[1], vs[2], &gkvec_rlm[0]);
Expand Down
4 changes: 2 additions & 2 deletions src/beta_projectors/beta_projectors_strain_deriv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
#pragma omp parallel for schedule(static)
for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
auto rtp = SHT::spherical_coordinates(gvc);
auto rtp = r3::spherical_coordinates(gvc);

double theta = rtp[1];
double phi = rtp[2];
Expand All @@ -70,7 +70,7 @@ class Beta_projectors_strain_deriv : public Beta_projectors_base<T>
for (int igkloc = 0; igkloc < this->num_gkvec_loc(); igkloc++) {
auto gvc = this->gkvec_.template gkvec_cart<sddk::index_domain_t::local>(igkloc);
/* vs = {r, theta, phi} */
auto gvs = SHT::spherical_coordinates(gvc);
auto gvs = r3::spherical_coordinates(gvc);

/* |G+k|=0 case */
if (gvs[0] < 1e-10) {
Expand Down
12 changes: 12 additions & 0 deletions src/context/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,18 @@ class config_t
}
dict_["/control/output"_json_pointer] = output__;
}
/// Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator.
inline auto gvec_chunk_size() const
{
return dict_.at("/control/gvec_chunk_size"_json_pointer).get<int>();
}
inline void gvec_chunk_size(int gvec_chunk_size__)
{
if (dict_.contains("locked")) {
throw std::runtime_error(locked_msg);
}
dict_["/control/gvec_chunk_size"_json_pointer] = gvec_chunk_size__;
}
private:
nlohmann::json& dict_;
};
Expand Down
65 changes: 35 additions & 30 deletions src/context/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -411,36 +411,41 @@
"type" : "boolean",
"default" : false,
"title" : "If true then the atomic forces are printed at the end of SCF run."
},
"print_timers" : {
"type" : "boolean",
"default" : true,
"title" : "If true then the timer statistics is printed at the end of SCF run."
},
"print_neighbors" : {
"type" : "boolean",
"default" : false,
"title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
},
"use_second_variation" : {
"type" : "boolean",
"default" : true,
"title" : "True if second-variational diagonalization is used in LAPW method."
},
"beta_chunk_size" : {
"type" : "integer",
"default" : 256,
"title" : "Number of atoms in a chunk of beta-projectors."
},
"ortho_rf" : {
"type" : "boolean",
"default" : false,
"title" : "Orthogonalize LAPW radial functions."
},
"output" : {
"type" : "string",
"default" : "stdout:",
"title" : "Type of the output stream (stdout:, file:name)"
},
"print_timers" : {
"type" : "boolean",
"default" : true,
"title" : "If true then the timer statistics is printed at the end of SCF run."
},
"print_neighbors" : {
"type" : "boolean",
"default" : false,
"title" : "If true then the list of nearest neighbours for each atom is printed to the standard output."
},
"use_second_variation" : {
"type" : "boolean",
"default" : true,
"title" : "True if second-variational diagonalization is used in LAPW method."
},
"beta_chunk_size" : {
"type" : "integer",
"default" : 256,
"title" : "Number of atoms in a chunk of beta-projectors."
},
"ortho_rf" : {
"type" : "boolean",
"default" : false,
"title" : "Orthogonalize LAPW radial functions."
},
"output" : {
"type" : "string",
"default" : "stdout:",
"title" : "Type of the output stream (stdout:, file:name)"
},
"gvec_chunk_size" : {
"type" : "integer",
"default" : 500000,
"title" : "Split local G-vectors in chunks to reduce the GPU memory consumption of augmentation operator."
}
}
},
Expand Down
Loading

0 comments on commit d371df2

Please sign in to comment.