From 1b9c97ec95a154d1a52d7b2a570cef162d4c3ae8 Mon Sep 17 00:00:00 2001 From: Chris Elrod Date: Sat, 28 Mar 2020 19:26:30 -0400 Subject: [PATCH] Change handling of reductions to better accomdate their use within the loop body, and don't unroll inner reductions where the unrolled loop is the reduction loop if that reduction is already tiled. --- Manifest.toml | 8 +-- Project.toml | 6 +-- docs/src/assets/bench_AmulB_v1.svg | 2 +- docs/src/assets/bench_AmulBt_v1.svg | 2 +- docs/src/assets/bench_Amulvb_v1.svg | 2 +- docs/src/assets/bench_AplusAt_v1.svg | 2 +- docs/src/assets/bench_AtmulB_v1.svg | 2 +- docs/src/assets/bench_AtmulBt_v1.svg | 2 +- docs/src/assets/bench_Atmulvb_v1.svg | 2 +- docs/src/assets/bench_aplusBc_v1.svg | 2 +- docs/src/assets/bench_dot3_v1.svg | 2 +- docs/src/assets/bench_dot_v1.svg | 2 +- docs/src/assets/bench_exp_v1.svg | 2 +- docs/src/assets/bench_filter2d_3x3_v1.svg | 2 +- docs/src/assets/bench_filter2d_dynamic_v1.svg | 2 +- .../src/assets/bench_filter2d_unrolled_v1.svg | 2 +- docs/src/assets/bench_logdettriangle_v1.svg | 2 +- docs/src/assets/bench_random_access_v1.svg | 2 +- docs/src/assets/bench_selfdot_v1.svg | 2 +- docs/src/assets/bench_sse_v1.svg | 2 +- src/add_compute.jl | 37 ++++++++++---- src/costs.jl | 12 ++--- src/determinestrategy.jl | 30 +++++++----- src/graphs.jl | 37 -------------- src/lower_compute.jl | 36 ++++++++++---- src/lower_constant.jl | 49 ++++++++++++++----- src/lower_store.jl | 46 +++++++++++++++-- src/lowering.jl | 12 +++++ src/operation_evaluation_order.jl | 6 ++- test/gemm.jl | 43 ++++++++++++++++ test/gemv.jl | 4 +- test/miscellaneous.jl | 2 +- test/offsetarrays.jl | 22 ++++----- test/printmethods.jl | 2 +- 34 files changed, 254 insertions(+), 134 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 8f50b337e..7557d0168 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -42,9 +42,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[SIMDPirates]] deps = ["VectorizationBase"] -git-tree-sha1 = "1a9cbe1be1f5d43ac49eeb38ca64dd78e50a0cc6" +git-tree-sha1 = "8f89aa38f5e4e89f2a474ffdc850fc21d6ab9ed4" uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a" -version = "0.7.3" +version = "0.7.4" [[SLEEFPirates]] deps = ["Libdl", "SIMDPirates", "VectorizationBase"] @@ -69,6 +69,6 @@ version = "0.1.0" [[VectorizationBase]] deps = ["CpuId", "LinearAlgebra"] -git-tree-sha1 = "83e32d4835fc4f4ecfd43eb59fa7fc00854b3d41" +git-tree-sha1 = "2a83ab02d3fb6ad5de0c6d04103b0ca403d9a7d8" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" -version = "0.9.4" +version = "0.9.5" diff --git a/Project.toml b/Project.toml index 532741bb7..b26f9a2c4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.6.23" +version = "0.6.24" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -13,10 +13,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] OffsetArrays = "1" -SIMDPirates = "0.7.3" +SIMDPirates = "0.7.4" SLEEFPirates = "0.4" UnPack = "0" -VectorizationBase = "0.9.3" +VectorizationBase = "0.9.5" julia = "1.1" [extras] diff --git a/docs/src/assets/bench_AmulB_v1.svg b/docs/src/assets/bench_AmulB_v1.svg index 6b7362732..a9cf7b8d9 100644 --- a/docs/src/assets/bench_AmulB_v1.svg +++ b/docs/src/assets/bench_AmulB_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AmulBt_v1.svg b/docs/src/assets/bench_AmulBt_v1.svg index 907aa5a16..7449fc426 100644 --- a/docs/src/assets/bench_AmulBt_v1.svg +++ b/docs/src/assets/bench_AmulBt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_Amulvb_v1.svg b/docs/src/assets/bench_Amulvb_v1.svg index 032e45699..536337d3c 100644 --- a/docs/src/assets/bench_Amulvb_v1.svg +++ b/docs/src/assets/bench_Amulvb_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560657075808590GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AplusAt_v1.svg b/docs/src/assets/bench_AplusAt_v1.svg index 162bca272..f75c1090f 100644 --- a/docs/src/assets/bench_AplusAt_v1.svg +++ b/docs/src/assets/bench_AplusAt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.51.01.52.02.53.03.54.04.55.0GFLOPSClang-PollyGFortranGFortran-builtinJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-builtinMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.51.01.52.02.53.03.54.04.55.0GFLOPSClang-PollyGFortranGFortran-builtinJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-builtinMethod diff --git a/docs/src/assets/bench_AtmulB_v1.svg b/docs/src/assets/bench_AtmulB_v1.svg index 440850e82..7601dc632 100644 --- a/docs/src/assets/bench_AtmulB_v1.svg +++ b/docs/src/assets/bench_AtmulB_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AtmulBt_v1.svg b/docs/src/assets/bench_AtmulBt_v1.svg index 998a1f6d9..9a317fb6d 100644 --- a/docs/src/assets/bench_AtmulBt_v1.svg +++ b/docs/src/assets/bench_AtmulBt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_Atmulvb_v1.svg b/docs/src/assets/bench_Atmulvb_v1.svg index 18dc5c72c..56545f80e 100644 --- a/docs/src/assets/bench_Atmulvb_v1.svg +++ b/docs/src/assets/bench_Atmulvb_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_aplusBc_v1.svg b/docs/src/assets/bench_aplusBc_v1.svg index 0d2826546..2bf62ba97 100644 --- a/docs/src/assets/bench_aplusBc_v1.svg +++ b/docs/src/assets/bench_aplusBc_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size024681012141618202224262830GFLOPSClang-PollyGFortranJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size024681012141618202224262830GFLOPSClang-PollyGFortranJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_dot3_v1.svg b/docs/src/assets/bench_dot3_v1.svg index 7cd086e7f..0b7509c98 100644 --- a/docs/src/assets/bench_dot3_v1.svg +++ b/docs/src/assets/bench_dot3_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055606570GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055606570GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_dot_v1.svg b/docs/src/assets/bench_dot_v1.svg index 8671b2ea5..fe87d26ff 100644 --- a/docs/src/assets/bench_dot_v1.svg +++ b/docs/src/assets/bench_dot_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_exp_v1.svg b/docs/src/assets/bench_exp_v1.svg index 50c15b320..0f5284a6f 100644 --- a/docs/src/assets/bench_exp_v1.svg +++ b/docs/src/assets/bench_exp_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.6GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.6GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_3x3_v1.svg b/docs/src/assets/bench_filter2d_3x3_v1.svg index 54c9ce0bd..0615ac2f9 100644 --- a/docs/src/assets/bench_filter2d_3x3_v1.svg +++ b/docs/src/assets/bench_filter2d_3x3_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_dynamic_v1.svg b/docs/src/assets/bench_filter2d_dynamic_v1.svg index 5998eb96d..8684ca309 100644 --- a/docs/src/assets/bench_filter2d_dynamic_v1.svg +++ b/docs/src/assets/bench_filter2d_dynamic_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_unrolled_v1.svg b/docs/src/assets/bench_filter2d_unrolled_v1.svg index 44a23a19f..7149680ba 100644 --- a/docs/src/assets/bench_filter2d_unrolled_v1.svg +++ b/docs/src/assets/bench_filter2d_unrolled_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_logdettriangle_v1.svg b/docs/src/assets/bench_logdettriangle_v1.svg index bd9672bff..ba6c061d5 100644 --- a/docs/src/assets/bench_logdettriangle_v1.svg +++ b/docs/src/assets/bench_logdettriangle_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.10.20.30.40.50.60.70.80.91.01.11.21.3GFLOPSClang-PollyGFortranJuliaJulia-builtinLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.10.20.30.40.50.60.70.80.91.01.11.21.3GFLOPSClang-PollyGFortranJuliaJulia-builtinLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_random_access_v1.svg b/docs/src/assets/bench_random_access_v1.svg index d64e3e78e..09da8c145 100644 --- a/docs/src/assets/bench_random_access_v1.svg +++ b/docs/src/assets/bench_random_access_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.62.83.03.23.4GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.62.83.03.23.43.63.84.0GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_selfdot_v1.svg b/docs/src/assets/bench_selfdot_v1.svg index 5939989b8..909d0dbd1 100644 --- a/docs/src/assets/bench_selfdot_v1.svg +++ b/docs/src/assets/bench_selfdot_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_sse_v1.svg b/docs/src/assets/bench_sse_v1.svg index 477351027..fc61824c7 100644 --- a/docs/src/assets/bench_sse_v1.svg +++ b/docs/src/assets/bench_sse_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/src/add_compute.jl b/src/add_compute.jl index 56ac30dbc..95a2291f7 100644 --- a/src/add_compute.jl +++ b/src/add_compute.jl @@ -33,7 +33,7 @@ function setdiffv!(s4::AbstractVector{T}, s3::AbstractVector{T}, s1::AbstractVec end end function update_deps!(deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation) - mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent)) + mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent)) if !(isload(parent) || isconstant(parent)) && !isreductcombineinstr(parent) mergesetv!(reduceddeps, reduceddependencies(parent)) end @@ -101,6 +101,18 @@ function isreductzero(op::Operation, ls::LoopSet, reduct_zero::Symbol) false end +# function substitute_op_in_parents!(vparents::Vector{Operation}, replacer::Operation, replacee::Operation) +# for i ∈ eachindex(vparents) +# opp = vparents[i] +# if opp === replacee +# vparents[i] = replacer +# else +# substitute_op_in_parents!(parents(opp), replacer, replacee) +# end +# end +# end + + function add_reduction_update_parent!( vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, parent::Operation, instr::Symbol, directdependency::Bool, elementbytes::Int @@ -109,11 +121,14 @@ function add_reduction_update_parent!( isouterreduction = parent.instruction === LOOPCONSTANT Instr = instruction(ls, instr) instrclass = reduction_instruction_class(Instr) # key allows for faster lookups + reduct_zero = reduction_zero(instrclass) # if parent is not an outer reduction... - if !isouterreduction - # and parent is not a reduction_zero - reduct_zero = reduction_zero(instrclass) + # if !isouterreduction && !isreductzero(parent, ls, reduct_zero) + add_reduct_instruct = !isouterreduction && !isconstant(parent) + if add_reduct_instruct + # We add reductcombine = reduction_scalar_combine(instrclass) + # reductcombine = :identity reductsym = gensym(:reduction) reductinit = add_constant!(ls, gensym(:reductzero), loopdependencies(parent), reductsym, elementbytes, :numericconstant) if reduct_zero === :zero @@ -128,13 +143,13 @@ function add_reduction_update_parent!( end pushpreamble!(ls, op, name, reductinit) end - if isreductzero(parent, ls, reduct_zero) - reductcombine = reduction_combine_to(instrclass) - end + # if + # reductcombine = reduction_combine_to(instrclass) + # end else reductinit = parent reductsym = var - reductcombine = Symbol("") + reductcombine = :identity#Symbol("") end combineddeps = copy(deps); mergesetv!(combineddeps, reduceddeps) # directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint @@ -145,6 +160,8 @@ function add_reduction_update_parent!( else push!(vparents, reductinit) end + # elseif !isouterreduction + # substitute_op_in_parents!(vparents, reductinit, parent) end update_reduction_status!(vparents, reduceddeps, name(reductinit)) # this is the op added by add_compute @@ -152,9 +169,11 @@ function add_reduction_update_parent!( parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op)) opout = pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector # isouterreduction || iszero(length(reduceddeps)) && return opout + # return opout isouterreduction && return opout # create child op, which is the reduction combination - childrdeps = Symbol[]; childparents = Operation[ op, parent ] + childrdeps = Symbol[]; childparents = Operation[ op ]#, parent ] + add_reduct_instruct && push!(childparents, parent) childdeps = loopdependencies(reductinit) setdiffv!(childrdeps, loopdependencies(op), childdeps) child = Operation( diff --git a/src/costs.jl b/src/costs.jl index ee4fa28c8..1a2bd9188 100644 --- a/src/costs.jl +++ b/src/costs.jl @@ -198,9 +198,9 @@ const COST = Dict{Instruction,InstructionCost}( Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0), Instruction(:transpose) => InstructionCost(0,0.0,0.0,0), Instruction(:prefetch) => InstructionCost(0,0.0,0.0,0), + Instruction(:prefetch0) => InstructionCost(0,0.0,0.0,0), Instruction(:prefetch1) => InstructionCost(0,0.0,0.0,0), - Instruction(:prefetch2) => InstructionCost(0,0.0,0.0,0), - Instruction(:prefetch3) => InstructionCost(0,0.0,0.0,0) + Instruction(:prefetch2) => InstructionCost(0,0.0,0.0,0) ) @inline prefetch0(x, i) = SIMDPirates.prefetch(gep(stridedpointer(x), (extract_data(i) - 1,)), Val{3}(), Val{0}()) @inline prefetch0(x, i, j) = SIMDPirates.prefetch(gep(stridedpointer(x), (extract_data(i) - 1, extract_data(j) - 1)), Val{3}(), Val{0}()) @@ -274,7 +274,7 @@ reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN) reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr) function reduction_to_single_vector(x::Float64) # x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vor : x == 4.0 ? :vand : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.") - x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :evadd : x == MULTIPLICATIVE_IN_REDUCTIONS ? :evmul : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.") end reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x)) # function reduction_to_scalar(x::Float64) @@ -284,17 +284,17 @@ reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction # reduction_to_scalar(x) = reduction_to_scalar(reduction_instruction_class(x)) function reduction_scalar_combine(x::Float64) # x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.") - x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :reduced_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduced_prod : x == MAX ? :reduced_max : x == MIN ? :reduced_min : throw("Reduction not found.") end reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x)) function reduction_combine_to(x::Float64) # x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.") - x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :reduce_to_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduce_to_prod : x == MAX ? :reduce_to_max : x == MIN ? :reduce_to_min : throw("Reduction not found.") end reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x)) function reduction_zero(x::Float64) # x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") - x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :zero : x == MULTIPLICATIVE_IN_REDUCTIONS ? :one : x == MAX ? :typemin : x == MIN ? :typemax : throw("Reduction not found.") end reduction_zero(x) = reduction_zero(reduction_instruction_class(x)) diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl index ed535ee52..bfc26f949 100644 --- a/src/determinestrategy.jl +++ b/src/determinestrategy.jl @@ -50,18 +50,18 @@ function register_pressure(op::Operation) end end function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes) - isconstant(op) && return 0.0, 0, Int(length(loopdependencies(op)) > 0) - isloopvalue(op) && return 0.0, 0, 0 + isconstant(op) && return 0.0, 0, Float64(length(loopdependencies(op)) > 0) + isloopvalue(op) && return 0.0, 0, 0.0 # Wshift == dependson(op, vectorized) ? Wshift : 0 # c = first(cost(instruction(op), Wshift, size_T))::Int instr = Instruction(:LoopVectorization, instruction(op).instr) # instr = instruction(op) if length(parents(op)) == 1 if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd) - return 0.0, 0, 0 + return 0.0, 0, 0.0 end elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op)) - return 0.0, 0, 0 + return 0.0, 0, 0.0 end opisvectorized = dependson(op, vectorized) srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr) @@ -80,7 +80,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_ sl *= 3 end end - srt, sl, srp + srt, sl, Float64(srp) end # Base._return_type() @@ -336,7 +336,7 @@ function maybedemotesize(U::Int, N::Int) um1rep > urep ? U : Um1 end function maybedemotesize(T::Int, N::Int, U::Int, Uloop::Loop, maxTbase::Int) - T > 1 || return T + T > 1 || return 1 T == N && return T T = maybedemotesize(T, N) if !(isstaticloop(Uloop) && length(Uloop) == U) @@ -451,11 +451,11 @@ function convolution_cost_factor(ls::LoopSet, op::Operation, u1::Symbol, u2::Sym if isstaticloop(loop) && length(loop) ≤ 4 itersym = loop.itersymbol if itersym !== u1 && itersym !== u2 - return (0.25, 1.0) + return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0) end end end - (0.25, 0.5) + (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0) else (1.0, 1.0) end @@ -529,7 +529,7 @@ function evaluate_cost_tile( factor1, factor2 = convolution_cost_factor(ls, op, unrolled, tiled, vectorized) rt *= factor1; rp *= factor2; end - # @show op rt, lat, rp + # @show isunrolled, istiled, op rt, lat, rp rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop rt *= iters[id] if isunrolled && istiled # no cost decrease; cost must be repeated @@ -546,6 +546,7 @@ function evaluate_cost_tile( reg_pressure[4] += rp end end + # @show reg_pressure costpenalty = (sum(reg_pressure) > VectorizationBase.REGISTER_COUNT) ? 2 : 1 # @show order, vectorized cost_vec reg_pressure # @show solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure) @@ -665,12 +666,15 @@ function choose_order(ls::LoopSet) end end -function register_pressure(ls::LoopSet) - order, unroll, vec, U, T = choose_order(ls) +function register_pressure(ls::LoopSet, U, T) if T == -1 sum(register_pressure, operations(ls)) else - rp = @view ls.reg_pressure[:,1] - tU * tT * rp[1] + tU * rp[2] + rp[3] + rp[4] + rp = @view ls.reg_pres[:,1] + U * T * rp[1] + U * rp[2] + rp[3] + rp[4] end end +function register_pressure(ls::LoopSet) + order, unroll, tile, vec, U, T = choose_order(ls) + register_pressure(ls, U, T) +end diff --git a/src/graphs.jl b/src/graphs.jl index be02ce3d6..8146b991d 100644 --- a/src/graphs.jl +++ b/src/graphs.jl @@ -1,41 +1,4 @@ -# """ -# ShortVector{T} simply wraps a Vector{T}, but uses a different hash function that is faster for short vectors to support using it as the keys of a Dict. -# This hash function scales O(N) with length of the vectors, so it is slow for long vectors. -# """ -# struct ShortVector{T} <: DenseVector{T} -# data::Vector{T} -# end -# Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...] -# Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v -# ShortVector{T}(::UndefInitializer, N::Integer) where {T} = ShortVector{T}(Vector{T}(undef, N)) -# @inbounds Base.length(x::ShortVector) = length(x.data) -# @inbounds Base.size(x::ShortVector) = size(x.data) -# @inbounds Base.strides(x::ShortVector) = strides(x.data) -# @inbounds Base.push!(x::ShortVector, v) = push!(x.data, v) -# @inbounds Base.append!(x::ShortVector, v) = append!(x.data, v) -# function Base.hash(x::ShortVector, h::UInt) -# @inbounds for n ∈ eachindex(x) -# h = hash(x[n], h) -# end -# h -# end -# function Base.isequal(a::ShortVector{T}, b::ShortVector{T}) where {T} -# length(a) == length(b) || return false -# @inbounds for i ∈ 1:length(a) -# a[i] === b[i] || return false -# end -# true -# end -# Base.convert(::Type{Vector}, sv::ShortVector) = sv.data -# Base.convert(::Type{Vector{T}}, sv::ShortVector{T}) where {T} = sv.data - - -# For passing options like array types and mask -# struct LoopSetOptions - -# end - struct UnrollSpecification unrolledloopnum::Int tiledloopnum::Int diff --git a/src/lower_compute.jl b/src/lower_compute.jl index 48f6655f0..796db5877 100644 --- a/src/lower_compute.jl +++ b/src/lower_compute.jl @@ -8,9 +8,10 @@ function lower_compute!( opunrolled = unrolled ∈ loopdependencies(op) ) - var = op.variable - mvar = mangledvar(op) + var = name(op) + instr = instruction(op) parents_op = parents(op) + mvar = mangledvar(op) nparents = length(parents_op) parentstiled = if suffix === nothing optiled = false @@ -25,13 +26,20 @@ function lower_compute!( optiled = true [tiled ∈ loopdependencies(opp) for opp ∈ parents_op] end - parentsunrolled = [unrolled ∈ loopdependencies(opp) || unrolled ∈ reducedchildren(opp) for opp ∈ parents_op] + parentsunrolled = isunrolled_sym.(parents_op, unrolled, tiled) + if instr.instr === :identity && name(first(parents_op)) === var && isone(length(parents_op)) + if (opunrolled == first(parentsunrolled)) && ((!isnothing(suffix)) == first(parentstiled)) + return + end + end + unrollsym = isunrolled_sym(op, unrolled, suffix) if !opunrolled && any(parentsunrolled) parents_op = copy(parents_op) for i ∈ eachindex(parentsunrolled) parentsunrolled[i] || continue parentsunrolled[i] = false parentop = parents_op[i] + # @show op, parentop newparentop = Operation( parentop.identifier, gensym(parentop.variable), parentop.elementbytes, parentop.instruction, parentop.node_type, parentop.dependencies, parentop.reduced_deps, parentop.parents, parentop.ref, parentop.reduced_children @@ -43,14 +51,18 @@ function lower_compute!( parentname = Symbol(parentname, suffix_) newparentname = Symbol(newparentname, suffix_) end - for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(newparentname, u), Symbol(parentname, u))) + if isconstant(newparentop) + push!(q.args, Expr(:(=), newparentname, Symbol(parentname, 0))) + continue + else + for u ∈ 0:U-1 + push!(q.args, Expr(:(=), Symbol(newparentname, u), Symbol(parentname, u))) + end + reduce_expr!(q, newparentname, Instruction(reduction_to_single_vector(instruction(newparentop))), U) + push!(q.args, Expr(:(=), newparentname, Symbol(newparentname, 0))) end - reduce_expr!(q, newparentname, Instruction(reduction_to_single_vector(instruction(newparentop))), U) - push!(q.args, Expr(:(=), newparentname, Symbol(newparentname, 0))) end end - instr = op.instruction # cache unroll and tiling check of parents # not broadcasted, because we use frequent checks of individual bools # making BitArrays inefficient. @@ -84,7 +96,7 @@ function lower_compute!( # modsuffix = ((u + suffix*U) & 3) modsuffix = (suffix & 3) Symbol(mvar, modsuffix) - elseif opunrolled + elseif unrollsym Symbol(mvar, u) else mvar @@ -125,7 +137,11 @@ function lower_compute!( continue end end - push!(q.args, Expr(:(=), varsym, instrcall)) + if instr.instr === :identity && isone(length(parents_op)) + push!(q.args, Expr(:(=), varsym, instrcall.args[2])) + else + push!(q.args, Expr(:(=), varsym, instrcall)) + end end end diff --git a/src/lower_constant.jl b/src/lower_constant.jl index c9e6f5c82..b4fe77919 100644 --- a/src/lower_constant.jl +++ b/src/lower_constant.jl @@ -24,15 +24,27 @@ function lower_zero!( else call = Expr(:call, :zero, typeT) end - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) + if isunrolled_sym(op, unrolled, suffix) + broadcastsym = Symbol(mvar, "_#init#") + push!(q.args, Expr(:(=), broadcastsym, call)) for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), call)) + push!(q.args, Expr(:(=), Symbol(mvar, u), broadcastsym)) end else push!(q.args, Expr(:(=), mvar, call)) end nothing end +# Have to awkwardly search through `operations(ls)` to try and find op's child +function getparentsreductzero(ls::LoopSet, op::Operation)::Float64 + opname = name(op) + for opp ∈ operations(ls) + if name(opp) === opname && opp !== op && iscompute(opp) && search_tree(parents(opp), opname) && length(reduceddependencies(opp)) > 0 + return reduction_instruction_class(instruction(opp)) + end + end + throw("Reduct zero not found.") +end function lower_constant!( q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, U::Int, suffix::Union{Nothing,Int} ) @@ -40,24 +52,37 @@ function lower_constant!( instruction = op.instruction mvar = variable_name(op, suffix) constsym = instruction.instr - if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op) + reducedchildvectorized = vectorized ∈ reducedchildren(op) + unroll = isunrolled_sym(op, unrolled, suffix) + if reducedchildvectorized || vectorized ∈ loopdependencies(op) || vectorized ∈ reduceddependencies(op) # call = Expr(:call, lv(:vbroadcast), W, Expr(:call, lv(:maybeconvert), typeT, constsym)) - call = Expr(:call, lv(:vbroadcast), W, constsym) - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) - for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), call)) + call = if reducedchildvectorized && vectorized ∉ loopdependencies(op) + instrclass = getparentsreductzero(ls, op) + if instrclass == ADDITIVE_IN_REDUCTIONS + Expr(:call, Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:SIMDPirates)), QuoteNode(:addscalar)), Expr(:call, lv(:vzero), W, typeT), constsym) + elseif instrclass == MULTIPLICATIVE_IN_REDUCTIONS + Expr(:call, Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:SIMDPirates)), QuoteNode(:mulscalar)), Expr(:call, lv(:vbroadcast), W, Expr(:call, :one, typeT)), constsym) + else + throw("Reductions of type $(reduction_zero(reinstrclass)) not yet supported; please file an issue as a reminder to take care of this.") end else - push!(q.args, Expr(:(=), mvar, call)) + Expr(:call, lv(:vbroadcast), W, constsym) end - else - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) + if unroll + broadcastsym = Symbol(mvar, "_#init#") + push!(q.args, Expr(:(=), broadcastsym, call)) for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), constsym)) + push!(q.args, Expr(:(=), Symbol(mvar, u), broadcastsym)) end else - push!(q.args, Expr(:(=), mvar, constsym)) + push!(q.args, Expr(:(=), mvar, call)) end + elseif unroll + for u ∈ 0:U-1 + push!(q.args, Expr(:(=), Symbol(mvar, u), constsym)) + end + else + push!(q.args, Expr(:(=), mvar, constsym)) end nothing end diff --git a/src/lower_store.jl b/src/lower_store.jl index 43d555c4b..4e385d94c 100644 --- a/src/lower_store.jl +++ b/src/lower_store.jl @@ -1,5 +1,41 @@ using VectorizationBase: vnoaliasstore! -const STOREOP = :vnoaliasstore! +# const STOREOP = :vnoaliasstore! + +@inline vstoreadditivereduce!(args...) = vnoaliasstore!(args...) +@inline vstoremultiplicativevereduce!(args...) = vnoaliasstore!(args...) +@inline function vstoreadditivereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}) where {N} + vnoaliasstore!(ptr, SIMDPirates.vsum(v), i) +end +@inline function vstoreadditivereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}, m::VectorizationBase.Mask) where {N} + vnoaliasstore!(ptr, SIMDPirates.vsum(v), i, m) +end +@inline function vstoremultiplicativevereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}) where {N} + vnoaliasstore!(ptr, SIMDPirates.vprod(v), i) +end +@inline function vstoremultiplicativevereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}, m::VectorizationBase.Mask) where {N} + vnoaliasstore!(ptr, SIMDPirates.vprod(v), i, m) +end + +function storeinstr(op::Operation) + opp = first(parents(op)) + if instruction(opp).instr === :identity + opp = first(parents(opp)) + end + instr = if iszero(length(reduceddependencies(opp))) + :vnoaliasstore! + else + instr_class = reduction_instruction_class(instruction(opp)) + if instr_class === ADDITIVE_IN_REDUCTIONS + :vstoreadditivereduce! + elseif instr_class === MULTIPLICATIVE_IN_REDUCTIONS + :vstoremultiplicativevereduce! + else #FIXME + :vnoaliasstore! + end + end + lv(instr) +end + # const STOREOP = :vstore! variable_name(op::Operation, ::Nothing) = mangledvar(op) variable_name(op::Operation, suffix) = Symbol(mangledvar(op), suffix, :_) @@ -69,7 +105,7 @@ function lower_conditionalstore_scalar!( varname = varassignname(var, u, parentisunrolled) condvarname = varassignname(condvar, u, condunrolled) td = UnrollArgs(u, unrolled, tiled, suffix) - push!(q.args, Expr(:&&, condvarname, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td)))) + push!(q.args, Expr(:&&, condvarname, Expr(:call, storeinstr(op), ptr, varname, mem_offset_u(op, td)))) end nothing end @@ -102,7 +138,7 @@ function lower_conditionalstore_vectorized!( td = UnrollArgs(u, unrolled, tiled, suffix) name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled) condvarname = varassignname(condvar, u, condunrolled) - instrcall = Expr(:call, lv(STOREOP), ptr, name, mo) + instrcall = Expr(:call, storeinstr(op), ptr, name, mo) if mask !== nothing && (vecnotunrolled || u == U - 1) push!(instrcall.args, Expr(:call, :&, condvarname, mask)) else @@ -122,7 +158,7 @@ function lower_store_scalar!( for u ∈ 0:U-1 varname = varassignname(var, u, parentisunrolled) td = UnrollArgs(u, unrolled, tiled, suffix) - push!(q.args, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td))) + push!(q.args, Expr(:call, storeinstr(op), ptr, varname, mem_offset_u(op, td))) end nothing end @@ -146,7 +182,7 @@ function lower_store_vectorized!( for u ∈ umin:U-1 td = UnrollArgs(u, unrolled, tiled, suffix) name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled) - instrcall = Expr(:call, lv(STOREOP), ptr, name, mo) + instrcall = Expr(:call, storeinstr(op), ptr, name, mo) if mask !== nothing && (vecnotunrolled || u == U - 1) push!(instrcall.args, mask) end diff --git a/src/lowering.jl b/src/lowering.jl index 5c7154042..50b09f526 100644 --- a/src/lowering.jl +++ b/src/lowering.jl @@ -408,3 +408,15 @@ end Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls) Base.show(io::IO, ls::LoopSet) = println(io, lower(ls)) +function isunrolled_sym(op::Operation, unrolled::Symbol, ::Nothing) + unrolled ∈ loopdependencies(op) || (isconstant(op) && (unrolled ∈ reducedchildren(op))) +end +function isunrolled_sym(op::Operation, unrolled::Symbol, ::Int) + uild = unrolled ∈ loopdependencies(op) + (isconstant(op) & uild) && return true # ignore reduced children if unrolled + uild && unrolled ∉ reduceddependencies(op) +end + +isunrolled_sym(op::Operation, unrolled::Symbol, istiled::Bool) = istiled ? isunrolled_sym(op, unrolled, 0) : isunrolled_sym(op, unrolled, nothing) +isunrolled_sym(op::Operation, unrolled::Symbol, tiled::Symbol) = isunrolled_sym(op, unrolled, tiled ∈ loopdependencies(op)) + diff --git a/src/operation_evaluation_order.jl b/src/operation_evaluation_order.jl index 3bb0db39e..5876c7e90 100644 --- a/src/operation_evaluation_order.jl +++ b/src/operation_evaluation_order.jl @@ -18,7 +18,7 @@ function set_upstream_family!(adal::Vector{T}, op::Operation, val::T, ld::Vector{Symbol}, id::Int) where {T} adal[identifier(op)] == val && return # must already have been set - # ld != loopdependencies(op) && + # @show op if ld != loopdependencies(op) || id == identifier(op) (adal[identifier(op)] = val) end @@ -43,7 +43,9 @@ function addoptoorder!( istiled = (loopistiled ? (tiled ∈ loopdependencies(op)) : false) + 1 # optype = Int(op.node_type) + 1 after_loop = place_after_loop[id] + 1 + # @show place_after_loop[id], op isloopvalue(op) || push!(lo[isunrolled,istiled,after_loop,_n], op) + # all(opp -> iszero(length(reduceddependencies(opp))), parents(op)) && set_upstream_family!(place_after_loop, op, false, loopdependencies(op), identifier(op)) # parents that have already been included are not moved, so no need to check included_vars to filter nothing end @@ -54,7 +56,7 @@ function fillorder!(ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, tiled: nloops = length(order) ops = operations(ls) nops = length(ops) - included_vars = fill!(resize!(ls.included_vars, nops), false) + included_vars = fill!(resize!(ls.included_vars, nops), false) place_after_loop = fill!(resize!(ls.place_after_loop, nops), true) # to go inside out, we just have to include all those not-yet included depending on the current sym empty!(lo) diff --git a/test/gemm.jl b/test/gemm.jl index 520fe0b0f..a353b8cd5 100644 --- a/test/gemm.jl +++ b/test/gemm.jl @@ -79,6 +79,14 @@ # end # end # end + # C = Cs; A = Ats'; B = Bs; factor = 1; + # ls = LoopVectorization.@avx_debug for m ∈ 1:size(A,1), n ∈ 1:size(B,2) + # ΔCₘₙ = zero(eltype(C)) + # for k ∈ 1:size(A,2) + # ΔCₘₙ += A[m,k] * B[k,n] + # end + # C[m,n] += ΔCₘₙ * factor + # end; function AmuladdBavx!(C, A, B, factor = 1) @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2) ΔCₘₙ = zero(eltype(C)) @@ -486,6 +494,33 @@ end return C end + + function gemm_accurate!(C, A, B) + @avx for n in 1:size(C,2), m in 1:size(C,1) + Cmn_hi = zero(eltype(C)) + Cmn_lo = zero(eltype(C)) + for k in 1:size(B,1) + hiprod = evmul(A[m,k], B[k,n]) + loprod = vfmsub(A[m,k], B[k,n], hiprod) + hi_ts = evadd(hiprod, Cmn_hi) + a1_ts = evsub(hi_ts, Cmn_hi) + b1_ts = evsub(hi_ts, a1_ts) + lo_ts = evadd(evsub(hiprod, a1_ts), evsub(Cmn_hi, b1_ts)) + thi = evadd(loprod, Cmn_lo) + a1_t = evsub(thi, Cmn_lo) + b1_t = evsub(thi, a1_t) + tlo = evadd(evsub(loprod, a1_t), evsub(Cmn_lo, b1_t)) + c1 = evadd(lo_ts, thi) + hi_ths = evadd(hi_ts, c1) + lo_ths = evsub(c1, evsub(hi_ths, hi_ts)) + c2 = evadd(tlo, lo_ths) + Cmn_hi = evadd(hi_ths, c2) + Cmn_lo = evsub(c2, evsub(Cmn_hi, hi_ths)) + end + C[m,n] = Cmn_hi + end + end + # M = 77; # A = rand(M,M); B = rand(M,M); C = similar(A); # mulCAtB_2x2block_avx!(C,A,B) @@ -583,6 +618,14 @@ @test C ≈ C2 fill!(C, 9999.999); mulCAtB_2x2blockavx_noinline!(C, A', B); @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, A, B); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, At', B); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, A, Bt'); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, At', Bt'); + @test C ≈ C2 end @time @testset "_avx $T dynamic gemm" begin AmulB_avx1!(C, A, B) diff --git a/test/gemv.jl b/test/gemv.jl index 4b9982496..5f769c840 100644 --- a/test/gemv.jl +++ b/test/gemv.jl @@ -91,7 +91,7 @@ using Test end function AtmulvBavx3!(G, B,κ) d = size(G,1) - @avx for d1=1:d + @avx tile=(2,2) for d1=1:d G[d1,κ] = B[1,d1]*B[1,κ] for d2=2:d G[d1,κ] += B[d2,d1]*B[d2,κ] @@ -160,7 +160,7 @@ using Test @test y1 ≈ y2 fill!(y2, -999.9); mygemv_avx!(y2, A, x) @test y1 ≈ y2 - fill!(y2, -999.9) + fill!(y2, -999.9); mygemvavx_range!(y2, A, x) @test y1 ≈ y2 diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl index d1f4c07e0..606ca34a7 100644 --- a/test/miscellaneous.jl +++ b/test/miscellaneous.jl @@ -184,7 +184,7 @@ end p += pn end) - # lsb = LoopVectorization.LoopSet(bq); + lsb = LoopVectorization.LoopSet(bq); function clenshaw!(ret,x,coeff) @inbounds for j in 1:length(ret) diff --git a/test/offsetarrays.jl b/test/offsetarrays.jl index a32b3d0ac..acdd98805 100644 --- a/test/offsetarrays.jl +++ b/test/offsetarrays.jl @@ -1,12 +1,12 @@ using LoopVectorization, OffsetArrays using LoopVectorization.VectorizationBase: StaticUnitRange using Test -# T = Float64 +T = Float64 # T = Float32 @testset "OffsetArrays" begin - function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern) + function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern) rng1k, rng2k = axes(kern) rng1, rng2 = axes(out) for j in rng2, i in rng1 @@ -48,15 +48,15 @@ using Test # end); # lsq2d = LoopVectorization.LoopSet(q2d); LoopVectorization.choose_order(lsq2d) - # oq2 = :(for j in rng2, i in rng1 - # tmp = zero(eltype(out)) - # for jk in -1:1, ik in -1:1 - # tmp += A[i+ik,j+jk]*kern[ik,jk] - # end - # out[i,j] = tmp - # end); - # lsoq = LoopVectorization.LoopSet(oq2); - # LoopVectorization.choose_order(lsoq) + oq2 = :(for j in rng2, i in rng1 + tmp = zero(eltype(out)) + for jk in -1:1, ik in -1:1 + tmp += A[i+ik,j+jk]*kern[ik,jk] + end + out[i,j] = tmp + end); + lsoq = LoopVectorization.LoopSet(oq2); + LoopVectorization.choose_order(lsoq) function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern) rng1k, rng2k = axes(kern) diff --git a/test/printmethods.jl b/test/printmethods.jl index 6333187a1..0673b8af2 100644 --- a/test/printmethods.jl +++ b/test/printmethods.jl @@ -25,5 +25,5 @@ @test occursin(" = A[m, k]", s) @test occursin(" = B[k, n]", s) @test occursin(" = LoopVectorization.vfmadd", s) - @test occursin(" = LoopVectorization.reduce_to_add", s) + # @test occursin(" = LoopVectorization.reduce_to_add", s) end