diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7d80e8de..8b84a26b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -72,7 +72,7 @@ jobs:
       include:
         - build: msrv
           os: ubuntu-18.04
-          rust: 1.49.0
+          rust: 1.52.0
         - build: stable
           os: ubuntu-18.04
           rust: stable
diff --git a/Cargo.toml b/Cargo.toml
index 92c6d369..c9a8b90c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,7 @@ exclude = [
 
 [features]
 default = ["alga", "multi_thread"]
-multi_thread = ["rayon", "num_cpus"]
+multi_thread = ["rayon", "num_cpus", "ndarray/rayon"]
 
 [dependencies]
 num-traits = "0.2.0"
diff --git a/README.md b/README.md
index cbf17450..bf8f2919 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ See the [changelog](changelog.rst).
 
 ## Minimum Supported Rust Version
 
-The minimum supported Rust version currently is 1.49. Prior to a 1.0 version,
+The minimum supported Rust version currently is 1.52. Prior to a 1.0 version,
 bumping the MSRV will not be considered a breaking change, but breakage will
 be avoided on a best effort basis.
 
diff --git a/src/sparse/csmat.rs b/src/sparse/csmat.rs
index 6cdcc439..66827cc1 100644
--- a/src/sparse/csmat.rs
+++ b/src/sparse/csmat.rs
@@ -1901,7 +1901,7 @@ where
 impl<'a, 'b, N, I, Iptr, IpS, IS, DS, DS2> Mul<&'b ArrayBase<DS2, Ix2>>
     for &'a CsMatBase<N, I, IpS, IS, DS, Iptr>
 where
-    N: 'a + crate::MulAcc + num_traits::Zero + Clone,
+    N: 'a + crate::MulAcc + num_traits::Zero + Clone + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
     IpS: 'a + Deref<Target = [Iptr]>,
@@ -1962,7 +1962,13 @@ where
 impl<'a, 'b, N, I, IpS, IS, DS, DS2> Dot<CsMatBase<N, I, IpS, IS, DS>>
     for ArrayBase<DS2, Ix2>
 where
-    N: 'a + Clone + crate::MulAcc + num_traits::Zero + std::fmt::Debug,
+    N: 'a
+        + Clone
+        + crate::MulAcc
+        + num_traits::Zero
+        + std::fmt::Debug
+        + Send
+        + Sync,
     I: 'a + SpIndex,
     IpS: 'a + Deref<Target = [I]>,
     IS: 'a + Deref<Target = [I]>,
@@ -2013,7 +2019,7 @@ where
 impl<'a, 'b, N, I, Iptr, IpS, IS, DS, DS2> Dot<ArrayBase<DS2, Ix2>>
     for CsMatBase<N, I, IpS, IS, DS, Iptr>
 where
-    N: 'a + Clone + crate::MulAcc + num_traits::Zero,
+    N: 'a + Clone + crate::MulAcc + num_traits::Zero + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
     IpS: 'a + Deref<Target = [Iptr]>,
@@ -2031,7 +2037,7 @@ where
 impl<'a, 'b, N, I, Iptr, IpS, IS, DS, DS2> Mul<&'b ArrayBase<DS2, Ix1>>
     for &'a CsMatBase<N, I, IpS, IS, DS, Iptr>
 where
-    N: 'a + Clone + crate::MulAcc + num_traits::Zero,
+    N: 'a + Clone + crate::MulAcc + num_traits::Zero + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
     IpS: 'a + Deref<Target = [Iptr]>,
@@ -2072,7 +2078,7 @@ where
 impl<'a, 'b, N, I, Iptr, IpS, IS, DS, DS2> Dot<ArrayBase<DS2, Ix1>>
     for CsMatBase<N, I, IpS, IS, DS, Iptr>
 where
-    N: 'a + Clone + crate::MulAcc + num_traits::Zero,
+    N: 'a + Clone + crate::MulAcc + num_traits::Zero + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
     IpS: 'a + Deref<Target = [Iptr]>,
diff --git a/src/sparse/prod.rs b/src/sparse/prod.rs
index 081b4071..85acda8a 100644
--- a/src/sparse/prod.rs
+++ b/src/sparse/prod.rs
@@ -190,7 +190,9 @@ pub fn csr_mulacc_dense_rowmaj<'a, N, A, B, I, Iptr>(
     rhs: ArrayView<B, Ix2>,
     mut out: ArrayViewMut<'a, N, Ix2>,
 ) where
-    N: 'a + crate::MulAcc<A, B>,
+    A: Send + Sync,
+    B: Send + Sync,
+    N: 'a + crate::MulAcc<A, B> + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
 {
@@ -200,6 +202,21 @@
     assert!(lhs.is_csr(), "Storage mismatch");
 
     let axis0 = Axis(0);
+    #[cfg(feature = "multi_thread")]
+    for (line, mut oline) in lhs.outer_iterator().zip(out.axis_iter_mut(axis0))
+    {
+        for (col_ind, lval) in line.iter() {
+            let rline = rhs.row(col_ind);
+            // TODO: call an axpy primitive to benefit from vectorisation?
+            ndarray::Zip::from(&mut oline).and(rline).par_for_each(
+                |oval, rval| {
+                    oval.mul_acc(lval, rval);
+                },
+            );
+        }
+    }
+
+    #[cfg(not(feature = "multi_thread"))]
     for (line, mut oline) in lhs.outer_iterator().zip(out.axis_iter_mut(axis0))
     {
         for (col_ind, lval) in line.iter() {
@@ -220,7 +237,9 @@ pub fn csc_mulacc_dense_rowmaj<'a, N, A, B, I, Iptr>(
     rhs: ArrayView<B, Ix2>,
     mut out: ArrayViewMut<'a, N, Ix2>,
 ) where
-    N: 'a + crate::MulAcc<A, B>,
+    A: Send + Sync,
+    B: Send + Sync,
+    N: 'a + crate::MulAcc<A, B> + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
 {
@@ -229,6 +248,19 @@
     assert_eq!(rhs.shape()[1], out.shape()[1], "Dimension mismatch");
     assert!(lhs.is_csc(), "Storage mismatch");
 
+    #[cfg(feature = "multi_thread")]
+    for (lcol, rline) in lhs.outer_iterator().zip(rhs.outer_iter()) {
+        for (orow, lval) in lcol.iter() {
+            let oline = out.row_mut(orow);
+            ndarray::Zip::from(oline)
+                .and(rline)
+                .par_for_each(|oval, rval| {
+                    oval.mul_acc(lval, rval);
+                });
+        }
+    }
+
+    #[cfg(not(feature = "multi_thread"))]
     for (lcol, rline) in lhs.outer_iterator().zip(rhs.outer_iter()) {
         for (orow, lval) in lcol.iter() {
             let mut oline = out.row_mut(orow);
@@ -247,7 +279,9 @@ pub fn csc_mulacc_dense_colmaj<'a, N, A, B, I, Iptr>(
     rhs: ArrayView<B, Ix2>,
     mut out: ArrayViewMut<'a, N, Ix2>,
 ) where
-    N: 'a + crate::MulAcc<A, B>,
+    A: Send + Sync,
+    B: Send + Sync,
+    N: 'a + crate::MulAcc<A, B> + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
 {
@@ -257,6 +291,20 @@
     assert!(lhs.is_csc(), "Storage mismatch");
     let axis1 = Axis(1);
+    // NOTE: See csr_mulacc_dense_colmaj, same issue
+    #[cfg(feature = "multi_thread")]
+    ndarray::Zip::from(out.axis_iter_mut(axis1))
+        .and(rhs.axis_iter(axis1))
+        .par_for_each(|mut ocol, rcol| {
+            for (rrow, lcol) in lhs.outer_iterator().enumerate() {
+                let rval = &rcol[[rrow]];
+                for (orow, lval) in lcol.iter() {
+                    ocol[[orow]].mul_acc(lval, rval);
+                }
+            }
+        });
+
+    #[cfg(not(feature = "multi_thread"))]
     for (mut ocol, rcol) in out.axis_iter_mut(axis1).zip(rhs.axis_iter(axis1))
     {
         for (rrow, lcol) in lhs.outer_iterator().enumerate() {
             let rval = &rcol[[rrow]];
@@ -275,7 +323,9 @@ pub fn csr_mulacc_dense_colmaj<'a, N, A, B, I, Iptr>(
     rhs: ArrayView<B, Ix2>,
     mut out: ArrayViewMut<'a, N, Ix2>,
 ) where
-    N: 'a + crate::MulAcc<A, B>,
+    A: Send + Sync,
+    B: Send + Sync,
+    N: 'a + crate::MulAcc<A, B> + Send + Sync,
     I: 'a + SpIndex,
     Iptr: 'a + SpIndex,
 {
@@ -285,6 +335,23 @@
     assert!(lhs.is_csr(), "Storage mismatch");
     let axis1 = Axis(1);
+    // NOTE: This is parallel over the columns of the output and rhs,
+    // which isn't ideal: a dense vector product is still sequential.
+    // Ideally CsMat.outer_iterator() would get a par_iter rayon impl.
+    #[cfg(feature = "multi_thread")]
+    ndarray::Zip::from(out.axis_iter_mut(axis1))
+        .and(rhs.axis_iter(axis1))
+        .par_for_each(|mut ocol, rcol| {
+            for (orow, lrow) in lhs.outer_iterator().enumerate() {
+                let oval = &mut ocol[[orow]];
+                for (rrow, lval) in lrow.iter() {
+                    let rval = &rcol[[rrow]];
+                    oval.mul_acc(lval, rval);
+                }
+            }
+        });
+
+    #[cfg(not(feature = "multi_thread"))]
     for (mut ocol, rcol) in out.axis_iter_mut(axis1).zip(rhs.axis_iter(axis1))
    {
         for (orow, lrow) in lhs.outer_iterator().enumerate() {
             let oval = &mut ocol[[orow]];
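
---

For reviewers, a minimal sketch of how the new parallel paths get exercised (not part of the patch; it assumes the crate's existing `CsMat::eye` constructor and the `Mul` impl touched above, and the shapes and values are purely illustrative). The rayon code only compiles when the `multi_thread` feature is enabled, e.g. `sprs = { version = "*", features = ["multi_thread"] }`:

```rust
// Illustrative only: CSR x dense (row-major) product, which should dispatch
// to csr_mulacc_dense_rowmaj above; with `multi_thread` enabled the per-row
// accumulation runs through rayon via ndarray::Zip::par_for_each.
use ndarray::arr2;
use sprs::CsMat;

fn main() {
    // 3x3 identity matrix in CSR storage
    let eye = CsMat::<f64>::eye(3);
    // Dense right-hand side in ndarray's default (row-major) layout
    let rhs = arr2(&[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]);
    // Sparse x dense multiplication; yields a dense Array2<f64>
    let out = &eye * &rhs;
    assert_eq!(out, rhs);
}
```

Since `multi_thread` sits in the default feature set (see the Cargo.toml hunk), a plain `cargo test` exercises the parallel kernels, while `--no-default-features` covers the sequential fallbacks.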