JuliaLinearAlgebra
diff --git a/‎Project.toml
Lines changed: 11 additions & 8 deletions b/‎Project.toml
Lines changed: 11 additions & 8 deletions
diff --git a/‎ext/HyperDualNumbersExt.jl
Lines changed: 249 additions & 0 deletions b/‎ext/HyperDualNumbersExt.jl
Lines changed: 249 additions & 0 deletions
diff --git a/‎src/Octavian.jl
Lines changed: 3 additions & 0 deletions b/‎src/Octavian.jl
Lines changed: 3 additions & 0 deletions
@@ -1,11 +1,12 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
-version = "0.3.22"
+version = "0.3.23"
 
 [deps]
 CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
@@ -16,9 +17,16 @@ StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718"
 ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
+[weakdeps]
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+
+[extensions]
+ForwardDiffExt = "ForwardDiff"
+
 [compat]
 CPUSummary = "0.1.26, 0.2.1"
 ForwardDiff = "0.10"
+HyperDualNumbers = "4"
 IfElse = "0.1"
 LoopVectorization = "0.12.86"
 ManualMemory = "0.1.1"
@@ -30,13 +38,11 @@ ThreadingUtilities = "0.5"
 VectorizationBase = "0.21.15"
 julia = "1.6"
 
-[extensions]
-ForwardDiffExt = "ForwardDiff"
-
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
@@ -45,7 +51,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [targets]
-test = ["Aqua", "BenchmarkTools", "ForwardDiff", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
-
-[weakdeps]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+test = ["Aqua", "BenchmarkTools", "ForwardDiff", "HyperDualNumbers", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
@@ -0,0 +1,249 @@
+module HyperDualNumbersExt
+
+using HyperDualNumbers: Hyper
+using Octavian: ArrayInterface,
+                 @turbo, @tturbo,
+                 One, Zero,
+                 indices, static
+import Octavian: real_rep, _matmul!, _matmul_serial!
+
+# Reinterpret an array of `Hyper{T}` as an array of `T` using the `reshape` form of `reinterpret`.
+real_rep(a::AbstractArray{DualT}) where {T,DualT<:Hyper{T}} = reinterpret(reshape, T, a)
+_view1(B::AbstractMatrix) = view(B, 1, :)  # non-copying slice at index 1 of the first axis
+_view1(B::AbstractArray{<:Any,3}) = view(B, 1, :, :)  # same slice for a rank-3 array
+
+for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
+  # multiplication of dual vector/matrix by standard matrix from the left: C = α*A*B + β*C
+  @eval function _matmul!(
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    A::AbstractMatrix,  # left factor, read as A[m, k]
+    _B::$(AbstractVectorOrMatrix){DualT},  # right factor with Hyper entries
+    α = One(),  # defaults to One() like the other `_matmul!` methods in this loop
+    β = Zero(),
+    nthread::Nothing = nothing,  # nthread, MKN and contig_axis are accepted but not used here
+    MKN = nothing,
+    contig_axis = nothing
+  ) where {T, DualT<:Hyper{T}}
+    B = real_rep(_B)  # arrays of T; the loops below treat axis 1 as the extra (component) axis
+    C = real_rep(_C)
+
+    @tturbo for n ∈ indices((C, B), 3),
+      m ∈ indices((C, A), (2, 1)),
+      l in indices((C, B), 1)
+
+      Cₗₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), 2)  # contract axis 2 of A with axis 2 of B
+        Cₗₘₙ += A[m, k] * B[l, k, n]
+      end
+      C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # each slice l is multiplied by the same A
+    end
+
+    _C
+  end
+
+  # multiplication of dual matrix by standard vector/matrix from the right: C = α*A*B + β*C
+  @eval @inline function _matmul!(
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    _A::AbstractMatrix{DualT},  # left factor with Hyper entries
+    B::$(AbstractVectorOrMatrix),  # right factor, read as B[k, n]
+    α = One(),
+    β = Zero(),
+    nthread::Nothing = nothing,  # forwarded to the inner `_matmul!` call on the fast path
+    MKN = nothing  # not used; the inner call is given `nothing` instead
+  ) where {T,DualT<:Hyper{T}}
+    if Bool(ArrayInterface.is_dense(_C)) &&  # fast path needs dense, column-major _C and _A
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method (plain reinterpret, no new axis)
+      A = reinterpret(T, _A)
+      C = reinterpret(T, _C)
+      _matmul!(C, A, B, α, β, nthread, nothing)  # operates on arrays with eltype T
+    else
+      # we cannot use the standard method directly, so loop over the real_rep arrays
+      A = real_rep(_A)
+      C = real_rep(_C)
+
+      @tturbo for n ∈ indices((C, B), (3, 2)),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 1))  # contract axis 3 of A with axis 1 of B
+          Cₗₘₙ += A[l, m, k] * B[k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # each slice l of A is multiplied by the same B
+      end
+    end
+
+    _C
+  end
+
+  @eval @inline function _matmul!(  # Hyper × Hyper product, built in three accumulation steps
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    _A::AbstractMatrix{DualT},  # left factor with Hyper entries
+    _B::$(AbstractVectorOrMatrix){DualT},  # right factor with Hyper entries
+    α = One(),
+    β = Zero(),
+    nthread::Nothing = nothing,  # forwarded to the inner `_matmul!` call on the fast path
+    MKN = nothing,  # MKN and contig are accepted but not used in this method
+    contig = nothing
+  ) where {T,DualT<:Hyper{T}}
+    A = real_rep(_A)  # arrays of T; axis 1 is indexed 1..4 below
+    C = real_rep(_C)
+    B = real_rep(_B)
+    if Bool(ArrayInterface.is_dense(_C)) &&  # fast path needs dense, column-major _C and _A
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method (step 1 on the fast path)
+      Ar = reinterpret(T, _A)
+      Cr = reinterpret(T, _C)
+      _matmul!(Cr, Ar, _view1(B), α, β, nthread, nothing)  # right factor is slice 1 of B only
+    else
+      # we cannot use the standard method directly (step 1 as an explicit loop)
+      @tturbo for n ∈ indices((C, B), 3),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 2))
+          Cₗₘₙ += A[l, m, k] * B[1, k, n]  # every slice l of A against slice 1 of B
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # β is applied only in this step
+      end
+    end
+    @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3  # step 2: slices 2..4
+      Cₚₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₚₘₙ += A[1, m, k] * B[p+1, k, n]  # slice 1 of A against slice p+1 of B
+      end
+      C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ  # accumulate on top of step 1
+    end
+
+    @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2)  # step 3: cross terms
+      Cₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n]
+      end
+      C[4, m, n] = C[4, m, n] + α * Cₘₙ  # only slice 4 receives the mixed 2/3 products
+    end
+    _C
+  end
+
+  # multiplication of dual vector/matrix by standard matrix from the left: C = α*A*B + β*C
+  @eval function _matmul_serial!(
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    A::AbstractMatrix,  # left factor, read as A[m, k]
+    _B::$(AbstractVectorOrMatrix){DualT},  # right factor with Hyper entries
+    α,
+    β,
+    MKN  # accepted but not used in this method
+  ) where {T, DualT<:Hyper{T}}
+    B = real_rep(_B)  # arrays of T; the loops below treat axis 1 as the extra (component) axis
+    C = real_rep(_C)
+
+    @turbo for n ∈ indices((C, B), 3),
+      m ∈ indices((C, A), (2, 1)),
+      l in indices((C, B), 1)
+
+      Cₗₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), 2)  # contract axis 2 of A with axis 2 of B
+        Cₗₘₙ += A[m, k] * B[l, k, n]
+      end
+      C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # each slice l is multiplied by the same A
+    end
+
+    _C
+  end
+
+  # multiplication of dual matrix by standard vector/matrix from the right: C = α*A*B + β*C
+  @eval @inline function _matmul_serial!(
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    _A::AbstractMatrix{DualT},  # left factor with Hyper entries
+    B::$(AbstractVectorOrMatrix),  # right factor, read as B[k, n]
+    α,
+    β,
+    MKN  # not used; the inner call is given `nothing` instead
+  ) where {T,DualT<:Hyper{T}}
+    if Bool(ArrayInterface.is_dense(_C)) &&  # fast path needs dense, column-major _C and _A
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method (plain reinterpret, no new axis)
+      A = reinterpret(T, _A)
+      C = reinterpret(T, _C)
+      _matmul_serial!(C, A, B, α, β, nothing)  # operates on arrays with eltype T
+    else
+      # we cannot use the standard method directly, so loop over the real_rep arrays
+      A = real_rep(_A)
+      C = real_rep(_C)
+
+      @turbo for n ∈ indices((C, B), (3, 2)),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 1))  # contract axis 3 of A with axis 1 of B
+          Cₗₘₙ += A[l, m, k] * B[k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # each slice l of A is multiplied by the same B
+      end
+    end
+
+    _C
+  end
+
+  @eval @inline function _matmul_serial!(  # serial Hyper × Hyper product in three steps
+    _C::$(AbstractVectorOrMatrix){DualT},  # destination; written in place and returned
+    _A::AbstractMatrix{DualT},  # left factor with Hyper entries
+    _B::$(AbstractVectorOrMatrix){DualT},  # right factor with Hyper entries
+    α,
+    β,
+    MKN  # not used; the inner call is given `nothing` instead
+  ) where {T, DualT<:Hyper{T}}
+    A = real_rep(_A)  # arrays of T; axis 1 is indexed 1..4 below
+    C = real_rep(_C)
+    B = real_rep(_B)
+    if Bool(ArrayInterface.is_dense(_C)) &&  # fast path needs dense, column-major _C and _A
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method (step 1 on the fast path)
+      Ar = reinterpret(T, _A)
+      Cr = reinterpret(T, _C)
+      _matmul_serial!(Cr, Ar, _view1(B), α, β, nothing)  # right factor is slice 1 of B only
+    else
+      # we cannot use the standard method directly (step 1 as an explicit loop)
+      @turbo for n ∈ indices((C, B), 3),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 2))
+          Cₗₘₙ += A[l, m, k] * B[1, k, n]  # every slice l of A against slice 1 of B
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]  # β is applied only in this step
+      end
+    end
+
+    @turbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3  # step 2: slices 2..4
+      Cₚₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₚₘₙ += A[1, m, k] * B[p+1, k, n]  # slice 1 of A against slice p+1 of B
+      end
+      C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ  # accumulate on top of step 1
+    end
+
+    @turbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2)  # step 3; @turbo like the loops above
+      Cₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n]
+      end
+      C[4, m, n] = C[4, m, n] + α * Cₘₙ  # only slice 4 receives the mixed 2/3 products
+    end
+    _C
+  end
+end # for
+
+end # module
@@ -73,6 +73,9 @@ if !isdefined(Base, :get_extension)
   include("../ext/ForwardDiffExt.jl")
 end
 
+# TODO: decide under which conditions HyperDualNumbersExt must be loaded; for now it is always included.
+include("../ext/HyperDualNumbersExt.jl")
+
 @static if VERSION >= v"1.8.0-beta1"
   @setup_workload begin
     # Putting some things in `setup` can reduce the size of the