add check for equal size for workspace and tile_t, and related doc
albestro committed Jan 21, 2025
1 parent a22f5a0 commit 0d2bd4f
Showing 2 changed files with 39 additions and 36 deletions.
4 changes: 2 additions & 2 deletions include/dlaf/factorization/qr.h
@@ -54,7 +54,7 @@ namespace dlaf::factorization::internal {
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
-/// get_tfactor_nworkers)
+/// get_tfactor_nworkers), each tile should have the same size as @p t
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols()
@@ -99,7 +99,7 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
-/// get_tfactor_nworkers)
+/// get_tfactor_nworkers), each tile should have the same size as @p t
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
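
The documented contract is that every tile in workspaces has the same dimensions as t: each worker accumulates a partial T factor in its own workspace tile, and those partials are then reduced into t (see the t_factor_impl.h hunk below). A minimal standalone sketch of that size check, using placeholder types rather than the DLA-Future API:

#include <stdexcept>
#include <vector>

// Placeholder for a tile shape; stands in for DLA-Future's TileElementSize.
struct Size {
  long rows;
  long cols;
};

inline bool equal_size(const Size& a, const Size& b) {
  return a.rows == b.rows && a.cols == b.cols;
}

// Reject workspaces that violate the documented precondition: one tile per worker,
// each with the same size as the T-factor tile t.
void validate_workspaces(const Size& t, const std::vector<Size>& workspaces) {
  for (const Size& ws : workspaces)
    if (!equal_size(ws, t))
      throw std::invalid_argument("workspace tile size must match t");
}
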
71 changes: 37 additions & 34 deletions include/dlaf/factorization/qr/t_factor_impl.h
@@ -132,39 +132,40 @@ struct Helpers<Backend::MC, Device::CPU, T> {

return ex::just(std::make_unique<pika::barrier<>>(nworkers)) |
di::continues_on(hp_scheduler) |
-             ex::bulk(nworkers,
-                      [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id,
-                                                                  auto& barrier_ptr) mutable {
-                        const SizeType k = taus.get().size().rows();
-
-                        const std::size_t begin = worker_id * batch_size;
-                        const std::size_t end =
-                            std::min(worker_id * batch_size + batch_size, hh_tiles.size());
-
-                        const matrix::Tile<T, Device::CPU>& ws_worker =
-                            worker_id == 0 ? tile_t : workspaces[worker_id - 1];
-
-                        tile::internal::set0<T>(ws_worker);
-                        lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1,
-                                      ws_worker.ptr(), ws_worker.ld() + 1);
-
-                        // make it work on worker_id section of tiles
-                        for (std::size_t index = begin; index < end; ++index) {
-                          const matrix::Tile<const T, Device::CPU>& tile_v =
-                              hh_tiles[index].get();
-                          loop_gemv(tile_v, taus.get(), ws_worker);
-                        }
-
-                        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-                        // reduce ws_T in tile_t
-                        if (worker_id == 0) {
-                          for (std::size_t other_worker = 1; other_worker < nworkers;
-                               ++other_worker) {
-                            tile::internal::add(T(1), workspaces[other_worker - 1], tile_t);
-                          }
-                        }
-                      }) |
+             ex::bulk(
+                 nworkers,
+                 [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id,
+                                                             auto& barrier_ptr) mutable {
+                   const SizeType k = taus.get().size().rows();
+
+                   const std::size_t begin = worker_id * batch_size;
+                   const std::size_t end =
+                       std::min(worker_id * batch_size + batch_size, hh_tiles.size());
+
+                   const matrix::Tile<T, Device::CPU>& ws_worker =
+                       worker_id == 0 ? tile_t : workspaces[worker_id - 1];
+
+                   DLAF_ASSERT(equal_size(ws_worker, tile_t), ws_worker.size(), tile_t.size());
+
+                   tile::internal::set0<T>(ws_worker);
+                   lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1, ws_worker.ptr(),
+                                 ws_worker.ld() + 1);
+
+                   // make it work on worker_id section of tiles
+                   for (std::size_t index = begin; index < end; ++index) {
+                     const matrix::Tile<const T, Device::CPU>& tile_v = hh_tiles[index].get();
+                     loop_gemv(tile_v, taus.get(), ws_worker);
+                   }
+
+                   barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                   // reduce ws_T in tile_t
+                   if (worker_id == 0) {
+                     for (std::size_t other_worker = 1; other_worker < nworkers; ++other_worker) {
+                       tile::internal::add(T(1), workspaces[other_worker - 1], tile_t);
+                     }
+                   }
+                 }) |
// Note: drop the barrier sent by the bulk and return tile_t
ex::then([&tile_t](auto&&) mutable { return std::move(tile_t); });
});
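
The bulk above splits the Householder tiles into contiguous batches, one per worker: worker 0 accumulates directly into tile_t, every other worker i into workspaces[i - 1], and after the barrier worker 0 adds the remaining partials into tile_t (which is why the new assertion requires equal sizes). A standalone sketch of that partitioning and reduction, using hypothetical helpers worker_range and reduce_partials with plain vectors standing in for tiles:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// [begin, end) range of Householder tiles handled by worker_id, given a fixed batch size.
std::pair<std::size_t, std::size_t> worker_range(std::size_t worker_id, std::size_t batch_size,
                                                 std::size_t ntiles) {
  const std::size_t begin = worker_id * batch_size;
  return {begin, std::min(begin + batch_size, ntiles)};
}

// Element-wise sum of the per-worker partial results into the first buffer,
// mirroring the reduction that worker 0 performs into tile_t.
void reduce_partials(std::vector<double>& tile_t,
                     const std::vector<std::vector<double>>& workspaces) {
  for (const auto& ws : workspaces)
    for (std::size_t i = 0; i < tile_t.size(); ++i)
      tile_t[i] += ws[i];
}

For example, with 7 tiles and batch_size 3, worker_range yields {0,3}, {3,6} and {6,7} for workers 0, 1 and 2.
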
@@ -271,7 +272,9 @@ struct Helpers<Backend::GPU, Device::GPU, T> {
di::Policy<Backend::GPU>(thread_priority::high),
[](cublasHandle_t handle, auto&& hh_tiles, auto&& taus,
matrix::Tile<T, Device::GPU>& tile_t) {
-          const SizeType k = tile_t.size().cols();
+          const SizeType k = taus.size().rows();
+
+          DLAF_ASSERT(tile_t.size() == TileElementSize(k, k), tile_t.size(), k);

// Note:
// prepare the diagonal of taus in t and reset the rest
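
On the GPU path, k used to be read from tile_t itself, so a mismatched T-factor tile could not be detected; the new code takes k from the number of reflectors (the rows of taus) and asserts that tile_t is exactly k by k. A compact sketch of that check, with a hypothetical Shape type standing in for TileElementSize:

#include <cassert>

// Hypothetical shape type; its comparison is assumed to match TileElementSize equality.
struct Shape {
  long rows;
  long cols;
};
inline bool operator==(Shape a, Shape b) { return a.rows == b.rows && a.cols == b.cols; }

// Derive k from the number of reflectors and require a k x k T-factor tile.
void check_tfactor_tile(Shape taus, Shape tile_t) {
  const long k = taus.rows;
  assert((tile_t == Shape{k, k}));
}
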
