add check for equal size for workspace and tile_t, and related doc
albestro committed Jan 21, 2025
1 parent a22f5a0 commit 0d2bd4f
Showing 2 changed files with 39 additions and 36 deletions.
4 changes: 2 additions & 2 deletions include/dlaf/factorization/qr.h
@@ -54,7 +54,7 @@ namespace dlaf::factorization::internal {
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
-/// get_tfactor_nworkers)
+/// get_tfactor_nworkers), each tile should have the same size as @p t
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.size().getWidth() <= t.get().size().cols()
@@ -99,7 +99,7 @@ void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
-/// get_tfactor_nworkers)
+/// get_tfactor_nworkers), each tile should have the same size as @p t
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
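
The documented contract is that every tile in workspaces has the same dimensions as t: each worker accumulates a partial T factor in its own workspace tile, and those partials are then reduced into t (see the t_factor_impl.h hunk below). A minimal standalone sketch of that size check, using placeholder types rather than the DLA-Future API:

#include <stdexcept>
#include <vector>

// Placeholder for a tile shape; stands in for DLA-Future's TileElementSize.
struct Size {
  long rows;
  long cols;
};

inline bool equal_size(const Size& a, const Size& b) {
  return a.rows == b.rows && a.cols == b.cols;
}

// Reject workspaces that violate the documented precondition: one tile per worker,
// each with the same size as the T-factor tile t.
void validate_workspaces(const Size& t, const std::vector<Size>& workspaces) {
  for (const Size& ws : workspaces)
    if (!equal_size(ws, t))
      throw std::invalid_argument("workspace tile size must match t");
}
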
71 changes: 37 additions & 34 deletions include/dlaf/factorization/qr/t_factor_impl.h
@@ -132,39 +132,40 @@ struct Helpers<Backend::MC, Device::CPU, T> {

return ex::just(std::make_unique<pika::barrier<>>(nworkers)) |
di::continues_on(hp_scheduler) |
-             ex::bulk(nworkers,
-                      [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id,
-                                                                  auto& barrier_ptr) mutable {
-                        const SizeType k = taus.get().size().rows();
-
-                        const std::size_t begin = worker_id * batch_size;
-                        const std::size_t end =
-                            std::min(worker_id * batch_size + batch_size, hh_tiles.size());
-
-                        const matrix::Tile<T, Device::CPU>& ws_worker =
-                            worker_id == 0 ? tile_t : workspaces[worker_id - 1];
-
-                        tile::internal::set0<T>(ws_worker);
-                        lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1,
-                                      ws_worker.ptr(), ws_worker.ld() + 1);
-
-                        // make it work on worker_id section of tiles
-                        for (std::size_t index = begin; index < end; ++index) {
-                          const matrix::Tile<const T, Device::CPU>& tile_v =
-                              hh_tiles[index].get();
-                          loop_gemv(tile_v, taus.get(), ws_worker);
-                        }
-
-                        barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-                        // reduce ws_T in tile_t
-                        if (worker_id == 0) {
-                          for (std::size_t other_worker = 1; other_worker < nworkers;
-                               ++other_worker) {
-                            tile::internal::add(T(1), workspaces[other_worker - 1], tile_t);
-                          }
-                        }
-                      }) |
+             ex::bulk(
+                 nworkers,
+                 [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id,
+                                                             auto& barrier_ptr) mutable {
+                   const SizeType k = taus.get().size().rows();
+
+                   const std::size_t begin = worker_id * batch_size;
+                   const std::size_t end =
+                       std::min(worker_id * batch_size + batch_size, hh_tiles.size());
+
+                   const matrix::Tile<T, Device::CPU>& ws_worker =
+                       worker_id == 0 ? tile_t : workspaces[worker_id - 1];
+
+                   DLAF_ASSERT(equal_size(ws_worker, tile_t), ws_worker.size(), tile_t.size());
+
+                   tile::internal::set0<T>(ws_worker);
+                   lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1, ws_worker.ptr(),
+                                 ws_worker.ld() + 1);
+
+                   // make it work on worker_id section of tiles
+                   for (std::size_t index = begin; index < end; ++index) {
+                     const matrix::Tile<const T, Device::CPU>& tile_v = hh_tiles[index].get();
+                     loop_gemv(tile_v, taus.get(), ws_worker);
+                   }
+
+                   barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+                   // reduce ws_T in tile_t
+                   if (worker_id == 0) {
+                     for (std::size_t other_worker = 1; other_worker < nworkers; ++other_worker) {
+                       tile::internal::add(T(1), workspaces[other_worker - 1], tile_t);
+                     }
+                   }
+                 }) |
// Note: drop the barrier sent by the bulk and return tile_t
ex::then([&tile_t](auto&&) mutable { return std::move(tile_t); });
});
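
The bulk above splits the Householder tiles into contiguous batches, one per worker: worker 0 accumulates directly into tile_t, every other worker i into workspaces[i - 1], and after the barrier worker 0 adds the remaining partials into tile_t (which is why the new assertion requires equal sizes). A standalone sketch of that partitioning and reduction, using hypothetical helpers worker_range and reduce_partials with plain vectors standing in for tiles:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// [begin, end) range of Householder tiles handled by worker_id, given a fixed batch size.
std::pair<std::size_t, std::size_t> worker_range(std::size_t worker_id, std::size_t batch_size,
                                                 std::size_t ntiles) {
  const std::size_t begin = worker_id * batch_size;
  return {begin, std::min(begin + batch_size, ntiles)};
}

// Element-wise sum of the per-worker partial results into the first buffer,
// mirroring the reduction that worker 0 performs into tile_t.
void reduce_partials(std::vector<double>& tile_t,
                     const std::vector<std::vector<double>>& workspaces) {
  for (const auto& ws : workspaces)
    for (std::size_t i = 0; i < tile_t.size(); ++i)
      tile_t[i] += ws[i];
}

For example, with 7 tiles and batch_size 3, worker_range yields {0,3}, {3,6} and {6,7} for workers 0, 1 and 2.
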
@@ -271,7 +272,9 @@ struct Helpers<Backend::GPU, Device::GPU, T> {
di::Policy<Backend::GPU>(thread_priority::high),
[](cublasHandle_t handle, auto&& hh_tiles, auto&& taus,
matrix::Tile<T, Device::GPU>& tile_t) {
-          const SizeType k = tile_t.size().cols();
+          const SizeType k = taus.size().rows();
+
+          DLAF_ASSERT(tile_t.size() == TileElementSize(k, k), tile_t.size(), k);

// Note:
// prepare the diagonal of taus in t and reset the rest
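
On the GPU path, k used to be read from tile_t itself, so a mismatched T-factor tile could not be detected; the new code takes k from the number of reflectors (the rows of taus) and asserts that tile_t is exactly k by k. A compact sketch of that check, with a hypothetical Shape type standing in for TileElementSize:

#include <cassert>

// Hypothetical shape type; its comparison is assumed to match TileElementSize equality.
struct Shape {
  long rows;
  long cols;
};
inline bool operator==(Shape a, Shape b) { return a.rows == b.rows && a.cols == b.cols; }

// Derive k from the number of reflectors and require a k x k T-factor tile.
void check_tfactor_tile(Shape taus, Shape tile_t) {
  const long k = taus.rows;
  assert((tile_t == Shape{k, k}));
}
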
