diff --git a/Manifest.toml b/Manifest.toml index 8f50b337e..7557d0168 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -42,9 +42,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[SIMDPirates]] deps = ["VectorizationBase"] -git-tree-sha1 = "1a9cbe1be1f5d43ac49eeb38ca64dd78e50a0cc6" +git-tree-sha1 = "8f89aa38f5e4e89f2a474ffdc850fc21d6ab9ed4" uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a" -version = "0.7.3" +version = "0.7.4" [[SLEEFPirates]] deps = ["Libdl", "SIMDPirates", "VectorizationBase"] @@ -69,6 +69,6 @@ version = "0.1.0" [[VectorizationBase]] deps = ["CpuId", "LinearAlgebra"] -git-tree-sha1 = "83e32d4835fc4f4ecfd43eb59fa7fc00854b3d41" +git-tree-sha1 = "2a83ab02d3fb6ad5de0c6d04103b0ca403d9a7d8" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" -version = "0.9.4" +version = "0.9.5" diff --git a/Project.toml b/Project.toml index 532741bb7..b26f9a2c4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.6.23" +version = "0.6.24" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -13,10 +13,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] OffsetArrays = "1" -SIMDPirates = "0.7.3" +SIMDPirates = "0.7.4" SLEEFPirates = "0.4" UnPack = "0" -VectorizationBase = "0.9.3" +VectorizationBase = "0.9.5" julia = "1.1" [extras] diff --git a/docs/src/assets/bench_AmulB_v1.svg b/docs/src/assets/bench_AmulB_v1.svg index 6b7362732..a9cf7b8d9 100644 --- a/docs/src/assets/bench_AmulB_v1.svg +++ b/docs/src/assets/bench_AmulB_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AmulBt_v1.svg b/docs/src/assets/bench_AmulBt_v1.svg index 907aa5a16..7449fc426 100644 --- a/docs/src/assets/bench_AmulBt_v1.svg +++ b/docs/src/assets/bench_AmulBt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120130GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_Amulvb_v1.svg b/docs/src/assets/bench_Amulvb_v1.svg index 032e45699..536337d3c 100644 --- a/docs/src/assets/bench_Amulvb_v1.svg +++ b/docs/src/assets/bench_Amulvb_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560657075808590GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AplusAt_v1.svg b/docs/src/assets/bench_AplusAt_v1.svg index 162bca272..f75c1090f 100644 --- a/docs/src/assets/bench_AplusAt_v1.svg +++ b/docs/src/assets/bench_AplusAt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.51.01.52.02.53.03.54.04.55.0GFLOPSClang-PollyGFortranGFortran-builtinJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-builtinMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.51.01.52.02.53.03.54.04.55.0GFLOPSClang-PollyGFortranGFortran-builtinJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-builtinMethod diff --git a/docs/src/assets/bench_AtmulB_v1.svg b/docs/src/assets/bench_AtmulB_v1.svg index 440850e82..7601dc632 100644 --- a/docs/src/assets/bench_AtmulB_v1.svg +++ b/docs/src/assets/bench_AtmulB_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_AtmulBt_v1.svg b/docs/src/assets/bench_AtmulBt_v1.svg index 998a1f6d9..9a317fb6d 100644 --- a/docs/src/assets/bench_AtmulBt_v1.svg +++ b/docs/src/assets/bench_AtmulBt_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0102030405060708090100110120GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_Atmulvb_v1.svg b/docs/src/assets/bench_Atmulvb_v1.svg index 18dc5c72c..56545f80e 100644 --- a/docs/src/assets/bench_Atmulvb_v1.svg +++ b/docs/src/assets/bench_Atmulvb_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550GFLOPSClang-PollyGFort-intrinsicGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortifort-intrinsicMethod diff --git a/docs/src/assets/bench_aplusBc_v1.svg b/docs/src/assets/bench_aplusBc_v1.svg index 0d2826546..2bf62ba97 100644 --- a/docs/src/assets/bench_aplusBc_v1.svg +++ b/docs/src/assets/bench_aplusBc_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size024681012141618202224262830GFLOPSClang-PollyGFortranJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size024681012141618202224262830GFLOPSClang-PollyGFortranJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_dot3_v1.svg b/docs/src/assets/bench_dot3_v1.svg index 7cd086e7f..0b7509c98 100644 --- a/docs/src/assets/bench_dot3_v1.svg +++ b/docs/src/assets/bench_dot3_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055606570GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055606570GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_dot_v1.svg b/docs/src/assets/bench_dot_v1.svg index 8671b2ea5..fe87d26ff 100644 --- a/docs/src/assets/bench_dot_v1.svg +++ b/docs/src/assets/bench_dot_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size02468101214161820222426283032343638404244GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_exp_v1.svg b/docs/src/assets/bench_exp_v1.svg index 50c15b320..0f5284a6f 100644 --- a/docs/src/assets/bench_exp_v1.svg +++ b/docs/src/assets/bench_exp_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.6GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.6GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_3x3_v1.svg b/docs/src/assets/bench_filter2d_3x3_v1.svg index 54c9ce0bd..0615ac2f9 100644 --- a/docs/src/assets/bench_filter2d_3x3_v1.svg +++ b/docs/src/assets/bench_filter2d_3x3_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_dynamic_v1.svg b/docs/src/assets/bench_filter2d_dynamic_v1.svg index 5998eb96d..8684ca309 100644 --- a/docs/src/assets/bench_filter2d_dynamic_v1.svg +++ b/docs/src/assets/bench_filter2d_dynamic_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_filter2d_unrolled_v1.svg b/docs/src/assets/bench_filter2d_unrolled_v1.svg index 44a23a19f..7149680ba 100644 --- a/docs/src/assets/bench_filter2d_unrolled_v1.svg +++ b/docs/src/assets/bench_filter2d_unrolled_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_logdettriangle_v1.svg b/docs/src/assets/bench_logdettriangle_v1.svg index bd9672bff..ba6c061d5 100644 --- a/docs/src/assets/bench_logdettriangle_v1.svg +++ b/docs/src/assets/bench_logdettriangle_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.10.20.30.40.50.60.70.80.91.01.11.21.3GFLOPSClang-PollyGFortranJuliaJulia-builtinLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.10.20.30.40.50.60.70.80.91.01.11.21.3GFLOPSClang-PollyGFortranJuliaJulia-builtinLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_random_access_v1.svg b/docs/src/assets/bench_random_access_v1.svg index d64e3e78e..09da8c145 100644 --- a/docs/src/assets/bench_random_access_v1.svg +++ b/docs/src/assets/bench_random_access_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.62.83.03.23.4GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0.00.20.40.60.81.01.21.41.61.82.02.22.42.62.83.03.23.43.63.84.0GFLOPSClang-PollyGFortranJuliaLoopVectorizationiccifortMethod diff --git a/docs/src/assets/bench_selfdot_v1.svg b/docs/src/assets/bench_selfdot_v1.svg index 5939989b8..909d0dbd1 100644 --- a/docs/src/assets/bench_selfdot_v1.svg +++ b/docs/src/assets/bench_selfdot_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size051015202530354045505560GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size0510152025303540455055GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/docs/src/assets/bench_sse_v1.svg b/docs/src/assets/bench_sse_v1.svg index 477351027..fc61824c7 100644 --- a/docs/src/assets/bench_sse_v1.svg +++ b/docs/src/assets/bench_sse_v1.svg @@ -1,3 +1,3 @@ -0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod +0102030405060708090100110120130140150160170180190200210220230240250260Size05101520253035404550556065707580GFLOPSClang-PollyGFortranIntelMKLJuliaLoopVectorizationg++ & Eigen-3iccicpc & Eigen-3ifortMethod diff --git a/src/add_compute.jl b/src/add_compute.jl index 56ac30dbc..95a2291f7 100644 --- a/src/add_compute.jl +++ b/src/add_compute.jl @@ -33,7 +33,7 @@ function setdiffv!(s4::AbstractVector{T}, s3::AbstractVector{T}, s1::AbstractVec end end function update_deps!(deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation) - mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent)) + mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent)) if !(isload(parent) || isconstant(parent)) && !isreductcombineinstr(parent) mergesetv!(reduceddeps, reduceddependencies(parent)) end @@ -101,6 +101,18 @@ function isreductzero(op::Operation, ls::LoopSet, reduct_zero::Symbol) false end +# function substitute_op_in_parents!(vparents::Vector{Operation}, replacer::Operation, replacee::Operation) +# for i ∈ eachindex(vparents) +# opp = vparents[i] +# if opp === replacee +# vparents[i] = replacer +# else +# substitute_op_in_parents!(parents(opp), replacer, replacee) +# end +# end +# end + + function add_reduction_update_parent!( vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, parent::Operation, instr::Symbol, directdependency::Bool, elementbytes::Int @@ -109,11 +121,14 @@ function add_reduction_update_parent!( isouterreduction = parent.instruction === LOOPCONSTANT Instr = instruction(ls, instr) instrclass = reduction_instruction_class(Instr) # key allows for faster lookups + reduct_zero = reduction_zero(instrclass) # if parent is not an outer reduction... - if !isouterreduction - # and parent is not a reduction_zero - reduct_zero = reduction_zero(instrclass) + # if !isouterreduction && !isreductzero(parent, ls, reduct_zero) + add_reduct_instruct = !isouterreduction && !isconstant(parent) + if add_reduct_instruct + # We add reductcombine = reduction_scalar_combine(instrclass) + # reductcombine = :identity reductsym = gensym(:reduction) reductinit = add_constant!(ls, gensym(:reductzero), loopdependencies(parent), reductsym, elementbytes, :numericconstant) if reduct_zero === :zero @@ -128,13 +143,13 @@ function add_reduction_update_parent!( end pushpreamble!(ls, op, name, reductinit) end - if isreductzero(parent, ls, reduct_zero) - reductcombine = reduction_combine_to(instrclass) - end + # if + # reductcombine = reduction_combine_to(instrclass) + # end else reductinit = parent reductsym = var - reductcombine = Symbol("") + reductcombine = :identity#Symbol("") end combineddeps = copy(deps); mergesetv!(combineddeps, reduceddeps) # directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint @@ -145,6 +160,8 @@ function add_reduction_update_parent!( else push!(vparents, reductinit) end + # elseif !isouterreduction + # substitute_op_in_parents!(vparents, reductinit, parent) end update_reduction_status!(vparents, reduceddeps, name(reductinit)) # this is the op added by add_compute @@ -152,9 +169,11 @@ function add_reduction_update_parent!( parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op)) opout = pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector # isouterreduction || iszero(length(reduceddeps)) && return opout + # return opout isouterreduction && return opout # create child op, which is the reduction combination - childrdeps = Symbol[]; childparents = Operation[ op, parent ] + childrdeps = Symbol[]; childparents = Operation[ op ]#, parent ] + add_reduct_instruct && push!(childparents, parent) childdeps = loopdependencies(reductinit) setdiffv!(childrdeps, loopdependencies(op), childdeps) child = Operation( diff --git a/src/costs.jl b/src/costs.jl index ee4fa28c8..1a2bd9188 100644 --- a/src/costs.jl +++ b/src/costs.jl @@ -198,9 +198,9 @@ const COST = Dict{Instruction,InstructionCost}( Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0), Instruction(:transpose) => InstructionCost(0,0.0,0.0,0), Instruction(:prefetch) => InstructionCost(0,0.0,0.0,0), + Instruction(:prefetch0) => InstructionCost(0,0.0,0.0,0), Instruction(:prefetch1) => InstructionCost(0,0.0,0.0,0), - Instruction(:prefetch2) => InstructionCost(0,0.0,0.0,0), - Instruction(:prefetch3) => InstructionCost(0,0.0,0.0,0) + Instruction(:prefetch2) => InstructionCost(0,0.0,0.0,0) ) @inline prefetch0(x, i) = SIMDPirates.prefetch(gep(stridedpointer(x), (extract_data(i) - 1,)), Val{3}(), Val{0}()) @inline prefetch0(x, i, j) = SIMDPirates.prefetch(gep(stridedpointer(x), (extract_data(i) - 1, extract_data(j) - 1)), Val{3}(), Val{0}()) @@ -274,7 +274,7 @@ reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN) reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr) function reduction_to_single_vector(x::Float64) # x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vor : x == 4.0 ? :vand : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.") - x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :evadd : x == MULTIPLICATIVE_IN_REDUCTIONS ? :evmul : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.") end reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x)) # function reduction_to_scalar(x::Float64) @@ -284,17 +284,17 @@ reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction # reduction_to_scalar(x) = reduction_to_scalar(reduction_instruction_class(x)) function reduction_scalar_combine(x::Float64) # x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.") - x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :reduced_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduced_prod : x == MAX ? :reduced_max : x == MIN ? :reduced_min : throw("Reduction not found.") end reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x)) function reduction_combine_to(x::Float64) # x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.") - x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :reduce_to_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduce_to_prod : x == MAX ? :reduce_to_max : x == MIN ? :reduce_to_min : throw("Reduction not found.") end reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x)) function reduction_zero(x::Float64) # x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") - x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.") + x == ADDITIVE_IN_REDUCTIONS ? :zero : x == MULTIPLICATIVE_IN_REDUCTIONS ? :one : x == MAX ? :typemin : x == MIN ? :typemax : throw("Reduction not found.") end reduction_zero(x) = reduction_zero(reduction_instruction_class(x)) diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl index ed535ee52..bfc26f949 100644 --- a/src/determinestrategy.jl +++ b/src/determinestrategy.jl @@ -50,18 +50,18 @@ function register_pressure(op::Operation) end end function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes) - isconstant(op) && return 0.0, 0, Int(length(loopdependencies(op)) > 0) - isloopvalue(op) && return 0.0, 0, 0 + isconstant(op) && return 0.0, 0, Float64(length(loopdependencies(op)) > 0) + isloopvalue(op) && return 0.0, 0, 0.0 # Wshift == dependson(op, vectorized) ? Wshift : 0 # c = first(cost(instruction(op), Wshift, size_T))::Int instr = Instruction(:LoopVectorization, instruction(op).instr) # instr = instruction(op) if length(parents(op)) == 1 if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd) - return 0.0, 0, 0 + return 0.0, 0, 0.0 end elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op)) - return 0.0, 0, 0 + return 0.0, 0, 0.0 end opisvectorized = dependson(op, vectorized) srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr) @@ -80,7 +80,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_ sl *= 3 end end - srt, sl, srp + srt, sl, Float64(srp) end # Base._return_type() @@ -336,7 +336,7 @@ function maybedemotesize(U::Int, N::Int) um1rep > urep ? U : Um1 end function maybedemotesize(T::Int, N::Int, U::Int, Uloop::Loop, maxTbase::Int) - T > 1 || return T + T > 1 || return 1 T == N && return T T = maybedemotesize(T, N) if !(isstaticloop(Uloop) && length(Uloop) == U) @@ -451,11 +451,11 @@ function convolution_cost_factor(ls::LoopSet, op::Operation, u1::Symbol, u2::Sym if isstaticloop(loop) && length(loop) ≤ 4 itersym = loop.itersymbol if itersym !== u1 && itersym !== u2 - return (0.25, 1.0) + return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0) end end end - (0.25, 0.5) + (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0) else (1.0, 1.0) end @@ -529,7 +529,7 @@ function evaluate_cost_tile( factor1, factor2 = convolution_cost_factor(ls, op, unrolled, tiled, vectorized) rt *= factor1; rp *= factor2; end - # @show op rt, lat, rp + # @show isunrolled, istiled, op rt, lat, rp rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop rt *= iters[id] if isunrolled && istiled # no cost decrease; cost must be repeated @@ -546,6 +546,7 @@ function evaluate_cost_tile( reg_pressure[4] += rp end end + # @show reg_pressure costpenalty = (sum(reg_pressure) > VectorizationBase.REGISTER_COUNT) ? 2 : 1 # @show order, vectorized cost_vec reg_pressure # @show solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure) @@ -665,12 +666,15 @@ function choose_order(ls::LoopSet) end end -function register_pressure(ls::LoopSet) - order, unroll, vec, U, T = choose_order(ls) +function register_pressure(ls::LoopSet, U, T) if T == -1 sum(register_pressure, operations(ls)) else - rp = @view ls.reg_pressure[:,1] - tU * tT * rp[1] + tU * rp[2] + rp[3] + rp[4] + rp = @view ls.reg_pres[:,1] + U * T * rp[1] + U * rp[2] + rp[3] + rp[4] end end +function register_pressure(ls::LoopSet) + order, unroll, tile, vec, U, T = choose_order(ls) + register_pressure(ls, U, T) +end diff --git a/src/graphs.jl b/src/graphs.jl index be02ce3d6..8146b991d 100644 --- a/src/graphs.jl +++ b/src/graphs.jl @@ -1,41 +1,4 @@ -# """ -# ShortVector{T} simply wraps a Vector{T}, but uses a different hash function that is faster for short vectors to support using it as the keys of a Dict. -# This hash function scales O(N) with length of the vectors, so it is slow for long vectors. -# """ -# struct ShortVector{T} <: DenseVector{T} -# data::Vector{T} -# end -# Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...] -# Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v -# ShortVector{T}(::UndefInitializer, N::Integer) where {T} = ShortVector{T}(Vector{T}(undef, N)) -# @inbounds Base.length(x::ShortVector) = length(x.data) -# @inbounds Base.size(x::ShortVector) = size(x.data) -# @inbounds Base.strides(x::ShortVector) = strides(x.data) -# @inbounds Base.push!(x::ShortVector, v) = push!(x.data, v) -# @inbounds Base.append!(x::ShortVector, v) = append!(x.data, v) -# function Base.hash(x::ShortVector, h::UInt) -# @inbounds for n ∈ eachindex(x) -# h = hash(x[n], h) -# end -# h -# end -# function Base.isequal(a::ShortVector{T}, b::ShortVector{T}) where {T} -# length(a) == length(b) || return false -# @inbounds for i ∈ 1:length(a) -# a[i] === b[i] || return false -# end -# true -# end -# Base.convert(::Type{Vector}, sv::ShortVector) = sv.data -# Base.convert(::Type{Vector{T}}, sv::ShortVector{T}) where {T} = sv.data - - -# For passing options like array types and mask -# struct LoopSetOptions - -# end - struct UnrollSpecification unrolledloopnum::Int tiledloopnum::Int diff --git a/src/lower_compute.jl b/src/lower_compute.jl index 48f6655f0..796db5877 100644 --- a/src/lower_compute.jl +++ b/src/lower_compute.jl @@ -8,9 +8,10 @@ function lower_compute!( opunrolled = unrolled ∈ loopdependencies(op) ) - var = op.variable - mvar = mangledvar(op) + var = name(op) + instr = instruction(op) parents_op = parents(op) + mvar = mangledvar(op) nparents = length(parents_op) parentstiled = if suffix === nothing optiled = false @@ -25,13 +26,20 @@ function lower_compute!( optiled = true [tiled ∈ loopdependencies(opp) for opp ∈ parents_op] end - parentsunrolled = [unrolled ∈ loopdependencies(opp) || unrolled ∈ reducedchildren(opp) for opp ∈ parents_op] + parentsunrolled = isunrolled_sym.(parents_op, unrolled, tiled) + if instr.instr === :identity && name(first(parents_op)) === var && isone(length(parents_op)) + if (opunrolled == first(parentsunrolled)) && ((!isnothing(suffix)) == first(parentstiled)) + return + end + end + unrollsym = isunrolled_sym(op, unrolled, suffix) if !opunrolled && any(parentsunrolled) parents_op = copy(parents_op) for i ∈ eachindex(parentsunrolled) parentsunrolled[i] || continue parentsunrolled[i] = false parentop = parents_op[i] + # @show op, parentop newparentop = Operation( parentop.identifier, gensym(parentop.variable), parentop.elementbytes, parentop.instruction, parentop.node_type, parentop.dependencies, parentop.reduced_deps, parentop.parents, parentop.ref, parentop.reduced_children @@ -43,14 +51,18 @@ function lower_compute!( parentname = Symbol(parentname, suffix_) newparentname = Symbol(newparentname, suffix_) end - for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(newparentname, u), Symbol(parentname, u))) + if isconstant(newparentop) + push!(q.args, Expr(:(=), newparentname, Symbol(parentname, 0))) + continue + else + for u ∈ 0:U-1 + push!(q.args, Expr(:(=), Symbol(newparentname, u), Symbol(parentname, u))) + end + reduce_expr!(q, newparentname, Instruction(reduction_to_single_vector(instruction(newparentop))), U) + push!(q.args, Expr(:(=), newparentname, Symbol(newparentname, 0))) end - reduce_expr!(q, newparentname, Instruction(reduction_to_single_vector(instruction(newparentop))), U) - push!(q.args, Expr(:(=), newparentname, Symbol(newparentname, 0))) end end - instr = op.instruction # cache unroll and tiling check of parents # not broadcasted, because we use frequent checks of individual bools # making BitArrays inefficient. @@ -84,7 +96,7 @@ function lower_compute!( # modsuffix = ((u + suffix*U) & 3) modsuffix = (suffix & 3) Symbol(mvar, modsuffix) - elseif opunrolled + elseif unrollsym Symbol(mvar, u) else mvar @@ -125,7 +137,11 @@ function lower_compute!( continue end end - push!(q.args, Expr(:(=), varsym, instrcall)) + if instr.instr === :identity && isone(length(parents_op)) + push!(q.args, Expr(:(=), varsym, instrcall.args[2])) + else + push!(q.args, Expr(:(=), varsym, instrcall)) + end end end diff --git a/src/lower_constant.jl b/src/lower_constant.jl index c9e6f5c82..b4fe77919 100644 --- a/src/lower_constant.jl +++ b/src/lower_constant.jl @@ -24,15 +24,27 @@ function lower_zero!( else call = Expr(:call, :zero, typeT) end - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) + if isunrolled_sym(op, unrolled, suffix) + broadcastsym = Symbol(mvar, "_#init#") + push!(q.args, Expr(:(=), broadcastsym, call)) for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), call)) + push!(q.args, Expr(:(=), Symbol(mvar, u), broadcastsym)) end else push!(q.args, Expr(:(=), mvar, call)) end nothing end +# Have to awkwardly search through `operations(ls)` to try and find op's child +function getparentsreductzero(ls::LoopSet, op::Operation)::Float64 + opname = name(op) + for opp ∈ operations(ls) + if name(opp) === opname && opp !== op && iscompute(opp) && search_tree(parents(opp), opname) && length(reduceddependencies(opp)) > 0 + return reduction_instruction_class(instruction(opp)) + end + end + throw("Reduct zero not found.") +end function lower_constant!( q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, U::Int, suffix::Union{Nothing,Int} ) @@ -40,24 +52,37 @@ function lower_constant!( instruction = op.instruction mvar = variable_name(op, suffix) constsym = instruction.instr - if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op) + reducedchildvectorized = vectorized ∈ reducedchildren(op) + unroll = isunrolled_sym(op, unrolled, suffix) + if reducedchildvectorized || vectorized ∈ loopdependencies(op) || vectorized ∈ reduceddependencies(op) # call = Expr(:call, lv(:vbroadcast), W, Expr(:call, lv(:maybeconvert), typeT, constsym)) - call = Expr(:call, lv(:vbroadcast), W, constsym) - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) - for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), call)) + call = if reducedchildvectorized && vectorized ∉ loopdependencies(op) + instrclass = getparentsreductzero(ls, op) + if instrclass == ADDITIVE_IN_REDUCTIONS + Expr(:call, Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:SIMDPirates)), QuoteNode(:addscalar)), Expr(:call, lv(:vzero), W, typeT), constsym) + elseif instrclass == MULTIPLICATIVE_IN_REDUCTIONS + Expr(:call, Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:SIMDPirates)), QuoteNode(:mulscalar)), Expr(:call, lv(:vbroadcast), W, Expr(:call, :one, typeT)), constsym) + else + throw("Reductions of type $(reduction_zero(reinstrclass)) not yet supported; please file an issue as a reminder to take care of this.") end else - push!(q.args, Expr(:(=), mvar, call)) + Expr(:call, lv(:vbroadcast), W, constsym) end - else - if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op) + if unroll + broadcastsym = Symbol(mvar, "_#init#") + push!(q.args, Expr(:(=), broadcastsym, call)) for u ∈ 0:U-1 - push!(q.args, Expr(:(=), Symbol(mvar, u), constsym)) + push!(q.args, Expr(:(=), Symbol(mvar, u), broadcastsym)) end else - push!(q.args, Expr(:(=), mvar, constsym)) + push!(q.args, Expr(:(=), mvar, call)) end + elseif unroll + for u ∈ 0:U-1 + push!(q.args, Expr(:(=), Symbol(mvar, u), constsym)) + end + else + push!(q.args, Expr(:(=), mvar, constsym)) end nothing end diff --git a/src/lower_store.jl b/src/lower_store.jl index 43d555c4b..4e385d94c 100644 --- a/src/lower_store.jl +++ b/src/lower_store.jl @@ -1,5 +1,41 @@ using VectorizationBase: vnoaliasstore! -const STOREOP = :vnoaliasstore! +# const STOREOP = :vnoaliasstore! + +@inline vstoreadditivereduce!(args...) = vnoaliasstore!(args...) +@inline vstoremultiplicativevereduce!(args...) = vnoaliasstore!(args...) +@inline function vstoreadditivereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}) where {N} + vnoaliasstore!(ptr, SIMDPirates.vsum(v), i) +end +@inline function vstoreadditivereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}, m::VectorizationBase.Mask) where {N} + vnoaliasstore!(ptr, SIMDPirates.vsum(v), i, m) +end +@inline function vstoremultiplicativevereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}) where {N} + vnoaliasstore!(ptr, SIMDPirates.vprod(v), i) +end +@inline function vstoremultiplicativevereduce!(ptr::VectorizationBase.AbstractStridedPointer, v::VectorizationBase.SVec, i::NTuple{N,<:Integer}, m::VectorizationBase.Mask) where {N} + vnoaliasstore!(ptr, SIMDPirates.vprod(v), i, m) +end + +function storeinstr(op::Operation) + opp = first(parents(op)) + if instruction(opp).instr === :identity + opp = first(parents(opp)) + end + instr = if iszero(length(reduceddependencies(opp))) + :vnoaliasstore! + else + instr_class = reduction_instruction_class(instruction(opp)) + if instr_class === ADDITIVE_IN_REDUCTIONS + :vstoreadditivereduce! + elseif instr_class === MULTIPLICATIVE_IN_REDUCTIONS + :vstoremultiplicativevereduce! + else #FIXME + :vnoaliasstore! + end + end + lv(instr) +end + # const STOREOP = :vstore! variable_name(op::Operation, ::Nothing) = mangledvar(op) variable_name(op::Operation, suffix) = Symbol(mangledvar(op), suffix, :_) @@ -69,7 +105,7 @@ function lower_conditionalstore_scalar!( varname = varassignname(var, u, parentisunrolled) condvarname = varassignname(condvar, u, condunrolled) td = UnrollArgs(u, unrolled, tiled, suffix) - push!(q.args, Expr(:&&, condvarname, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td)))) + push!(q.args, Expr(:&&, condvarname, Expr(:call, storeinstr(op), ptr, varname, mem_offset_u(op, td)))) end nothing end @@ -102,7 +138,7 @@ function lower_conditionalstore_vectorized!( td = UnrollArgs(u, unrolled, tiled, suffix) name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled) condvarname = varassignname(condvar, u, condunrolled) - instrcall = Expr(:call, lv(STOREOP), ptr, name, mo) + instrcall = Expr(:call, storeinstr(op), ptr, name, mo) if mask !== nothing && (vecnotunrolled || u == U - 1) push!(instrcall.args, Expr(:call, :&, condvarname, mask)) else @@ -122,7 +158,7 @@ function lower_store_scalar!( for u ∈ 0:U-1 varname = varassignname(var, u, parentisunrolled) td = UnrollArgs(u, unrolled, tiled, suffix) - push!(q.args, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td))) + push!(q.args, Expr(:call, storeinstr(op), ptr, varname, mem_offset_u(op, td))) end nothing end @@ -146,7 +182,7 @@ function lower_store_vectorized!( for u ∈ umin:U-1 td = UnrollArgs(u, unrolled, tiled, suffix) name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled) - instrcall = Expr(:call, lv(STOREOP), ptr, name, mo) + instrcall = Expr(:call, storeinstr(op), ptr, name, mo) if mask !== nothing && (vecnotunrolled || u == U - 1) push!(instrcall.args, mask) end diff --git a/src/lowering.jl b/src/lowering.jl index 5c7154042..50b09f526 100644 --- a/src/lowering.jl +++ b/src/lowering.jl @@ -408,3 +408,15 @@ end Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls) Base.show(io::IO, ls::LoopSet) = println(io, lower(ls)) +function isunrolled_sym(op::Operation, unrolled::Symbol, ::Nothing) + unrolled ∈ loopdependencies(op) || (isconstant(op) && (unrolled ∈ reducedchildren(op))) +end +function isunrolled_sym(op::Operation, unrolled::Symbol, ::Int) + uild = unrolled ∈ loopdependencies(op) + (isconstant(op) & uild) && return true # ignore reduced children if unrolled + uild && unrolled ∉ reduceddependencies(op) +end + +isunrolled_sym(op::Operation, unrolled::Symbol, istiled::Bool) = istiled ? isunrolled_sym(op, unrolled, 0) : isunrolled_sym(op, unrolled, nothing) +isunrolled_sym(op::Operation, unrolled::Symbol, tiled::Symbol) = isunrolled_sym(op, unrolled, tiled ∈ loopdependencies(op)) + diff --git a/src/operation_evaluation_order.jl b/src/operation_evaluation_order.jl index 3bb0db39e..5876c7e90 100644 --- a/src/operation_evaluation_order.jl +++ b/src/operation_evaluation_order.jl @@ -18,7 +18,7 @@ function set_upstream_family!(adal::Vector{T}, op::Operation, val::T, ld::Vector{Symbol}, id::Int) where {T} adal[identifier(op)] == val && return # must already have been set - # ld != loopdependencies(op) && + # @show op if ld != loopdependencies(op) || id == identifier(op) (adal[identifier(op)] = val) end @@ -43,7 +43,9 @@ function addoptoorder!( istiled = (loopistiled ? (tiled ∈ loopdependencies(op)) : false) + 1 # optype = Int(op.node_type) + 1 after_loop = place_after_loop[id] + 1 + # @show place_after_loop[id], op isloopvalue(op) || push!(lo[isunrolled,istiled,after_loop,_n], op) + # all(opp -> iszero(length(reduceddependencies(opp))), parents(op)) && set_upstream_family!(place_after_loop, op, false, loopdependencies(op), identifier(op)) # parents that have already been included are not moved, so no need to check included_vars to filter nothing end @@ -54,7 +56,7 @@ function fillorder!(ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, tiled: nloops = length(order) ops = operations(ls) nops = length(ops) - included_vars = fill!(resize!(ls.included_vars, nops), false) + included_vars = fill!(resize!(ls.included_vars, nops), false) place_after_loop = fill!(resize!(ls.place_after_loop, nops), true) # to go inside out, we just have to include all those not-yet included depending on the current sym empty!(lo) diff --git a/test/gemm.jl b/test/gemm.jl index 520fe0b0f..a353b8cd5 100644 --- a/test/gemm.jl +++ b/test/gemm.jl @@ -79,6 +79,14 @@ # end # end # end + # C = Cs; A = Ats'; B = Bs; factor = 1; + # ls = LoopVectorization.@avx_debug for m ∈ 1:size(A,1), n ∈ 1:size(B,2) + # ΔCₘₙ = zero(eltype(C)) + # for k ∈ 1:size(A,2) + # ΔCₘₙ += A[m,k] * B[k,n] + # end + # C[m,n] += ΔCₘₙ * factor + # end; function AmuladdBavx!(C, A, B, factor = 1) @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2) ΔCₘₙ = zero(eltype(C)) @@ -486,6 +494,33 @@ end return C end + + function gemm_accurate!(C, A, B) + @avx for n in 1:size(C,2), m in 1:size(C,1) + Cmn_hi = zero(eltype(C)) + Cmn_lo = zero(eltype(C)) + for k in 1:size(B,1) + hiprod = evmul(A[m,k], B[k,n]) + loprod = vfmsub(A[m,k], B[k,n], hiprod) + hi_ts = evadd(hiprod, Cmn_hi) + a1_ts = evsub(hi_ts, Cmn_hi) + b1_ts = evsub(hi_ts, a1_ts) + lo_ts = evadd(evsub(hiprod, a1_ts), evsub(Cmn_hi, b1_ts)) + thi = evadd(loprod, Cmn_lo) + a1_t = evsub(thi, Cmn_lo) + b1_t = evsub(thi, a1_t) + tlo = evadd(evsub(loprod, a1_t), evsub(Cmn_lo, b1_t)) + c1 = evadd(lo_ts, thi) + hi_ths = evadd(hi_ts, c1) + lo_ths = evsub(c1, evsub(hi_ths, hi_ts)) + c2 = evadd(tlo, lo_ths) + Cmn_hi = evadd(hi_ths, c2) + Cmn_lo = evsub(c2, evsub(Cmn_hi, hi_ths)) + end + C[m,n] = Cmn_hi + end + end + # M = 77; # A = rand(M,M); B = rand(M,M); C = similar(A); # mulCAtB_2x2block_avx!(C,A,B) @@ -583,6 +618,14 @@ @test C ≈ C2 fill!(C, 9999.999); mulCAtB_2x2blockavx_noinline!(C, A', B); @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, A, B); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, At', B); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, A, Bt'); + @test C ≈ C2 + fill!(C, 9999.999); gemm_accurate!(C, At', Bt'); + @test C ≈ C2 end @time @testset "_avx $T dynamic gemm" begin AmulB_avx1!(C, A, B) diff --git a/test/gemv.jl b/test/gemv.jl index 4b9982496..5f769c840 100644 --- a/test/gemv.jl +++ b/test/gemv.jl @@ -91,7 +91,7 @@ using Test end function AtmulvBavx3!(G, B,κ) d = size(G,1) - @avx for d1=1:d + @avx tile=(2,2) for d1=1:d G[d1,κ] = B[1,d1]*B[1,κ] for d2=2:d G[d1,κ] += B[d2,d1]*B[d2,κ] @@ -160,7 +160,7 @@ using Test @test y1 ≈ y2 fill!(y2, -999.9); mygemv_avx!(y2, A, x) @test y1 ≈ y2 - fill!(y2, -999.9) + fill!(y2, -999.9); mygemvavx_range!(y2, A, x) @test y1 ≈ y2 diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl index d1f4c07e0..606ca34a7 100644 --- a/test/miscellaneous.jl +++ b/test/miscellaneous.jl @@ -184,7 +184,7 @@ end p += pn end) - # lsb = LoopVectorization.LoopSet(bq); + lsb = LoopVectorization.LoopSet(bq); function clenshaw!(ret,x,coeff) @inbounds for j in 1:length(ret) diff --git a/test/offsetarrays.jl b/test/offsetarrays.jl index a32b3d0ac..acdd98805 100644 --- a/test/offsetarrays.jl +++ b/test/offsetarrays.jl @@ -1,12 +1,12 @@ using LoopVectorization, OffsetArrays using LoopVectorization.VectorizationBase: StaticUnitRange using Test -# T = Float64 +T = Float64 # T = Float32 @testset "OffsetArrays" begin - function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern) + function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern) rng1k, rng2k = axes(kern) rng1, rng2 = axes(out) for j in rng2, i in rng1 @@ -48,15 +48,15 @@ using Test # end); # lsq2d = LoopVectorization.LoopSet(q2d); LoopVectorization.choose_order(lsq2d) - # oq2 = :(for j in rng2, i in rng1 - # tmp = zero(eltype(out)) - # for jk in -1:1, ik in -1:1 - # tmp += A[i+ik,j+jk]*kern[ik,jk] - # end - # out[i,j] = tmp - # end); - # lsoq = LoopVectorization.LoopSet(oq2); - # LoopVectorization.choose_order(lsoq) + oq2 = :(for j in rng2, i in rng1 + tmp = zero(eltype(out)) + for jk in -1:1, ik in -1:1 + tmp += A[i+ik,j+jk]*kern[ik,jk] + end + out[i,j] = tmp + end); + lsoq = LoopVectorization.LoopSet(oq2); + LoopVectorization.choose_order(lsoq) function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern) rng1k, rng2k = axes(kern) diff --git a/test/printmethods.jl b/test/printmethods.jl index 6333187a1..0673b8af2 100644 --- a/test/printmethods.jl +++ b/test/printmethods.jl @@ -25,5 +25,5 @@ @test occursin(" = A[m, k]", s) @test occursin(" = B[k, n]", s) @test occursin(" = LoopVectorization.vfmadd", s) - @test occursin(" = LoopVectorization.reduce_to_add", s) + # @test occursin(" = LoopVectorization.reduce_to_add", s) end