diff --git a/.travis.yml b/.travis.yml index 1161a891e..9ae0e585f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,3 +26,4 @@ env: - COVERALLS_PARALLEL=true notifications: webhooks: "https://coveralls.io/webhook" + diff --git a/Project.toml b/Project.toml index f55fab830..c4cb7e313 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.6.28" +version = "0.6.29" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index 8cf4c7efe..7c6cb0d77 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -12,10 +12,12 @@ using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange, reduced_ sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch, vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone +using SLEEFPirates: pow using Base.Broadcast: Broadcasted, DefaultArrayStyle using LinearAlgebra: Adjoint, Transpose using Base.Meta: isexpr + const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer} export LowDimArray, stridedpointer, vectorizable, diff --git a/src/add_compute.jl b/src/add_compute.jl index 3650bd26e..bf5ac95fd 100644 --- a/src/add_compute.jl +++ b/src/add_compute.jl @@ -180,6 +180,8 @@ function add_reduction_update_parent!( pushop!(ls, child, name(parent)) opout end + + function add_compute!( ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int, position::Int, mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing @@ -187,6 +189,7 @@ function add_compute!( @assert ex.head === :call instr = instruction(first(ex.args))::Symbol args = @view(ex.args[2:end]) + (instr === :(^) && length(args) == 2 && (args[2] isa Number)) && return add_pow!(ls, var, args[1], args[2], elementbytes, position) parents = Operation[] deps = Symbol[] reduceddeps = Symbol[] @@ -215,12 +218,11 @@ function add_compute!( end end reduction = reduction_ind > 0 + loopnestview = view(ls.loopsymbols, 1:position) if iszero(length(deps)) && reduction - loopnestview = view(ls.loopsymbols, 1:position) append!(deps, loopnestview) append!(reduceddeps, loopnestview) else - loopnestview = view(ls.loopsymbols, 1:position) newloopdeps = Symbol[]; newreduceddeps = Symbol[]; setdiffv!(newloopdeps, newreduceddeps, deps, loopnestview) mergesetv!(newreduceddeps, reduceddeps) @@ -252,3 +254,60 @@ function add_compute!( pushop!(ls, op, LHS) end +# adds x ^ (p::Real) +function add_pow!( + ls::LoopSet, var::Symbol, x, p::Real, elementbytes::Int, position::Int +) + xop = if x isa Expr + add_operation!(ls, gensym(:xpow), x, elementbytes, position) + elseif x isa Symbol + xo = get(ls.opdict, x, nothing) + if isnothing(xo) + pushpreamble!(ls, Expr(:(=), var, Expr(:call, :(^), x, p))) + return add_constant!(ls, var, elementbytes) + end + xo + elseif x isa Number + pushpreamble!(ls, Expr(:(=), var, x ^ p)) + return add_constant!(ls, var, elementbytes) + end + pint = round(Int, p) + if p != pint + pop = add_constant!(ls, p, elementbytes) + return add_compute!(ls, var, :^, [xop, pop], elementbytes) + end + if pint == -1 + return add_compute!(ls, var, :vinv, [xop], elementbytes) + elseif pint < 0 + xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes) + pint = - pint + end + if pint == 0 + op = Operation(length(operations(ls)), var, elementbytes, LOOPCONSTANT, constant, NODEPENDENCY, Symbol[], NOPARENTS) + push!(ls.preamble_ones, (identifier(op),IntOrFloat)) + return pushop!(ls, op) + elseif pint == 1 + return add_compute!(ls, var, :identity, [xop], elementbytes) + elseif pint == 2 + return add_compute!(ls, var, :vabs2, [xop], elementbytes) + end + + # Implementation from https://github.com/JuliaLang/julia/blob/a965580ba7fd0e8314001521df254e30d686afbf/base/intfuncs.jl#L216 + t = trailing_zeros(pint) + 1 + pint >>= t + while (t -= 1) > 0 + varname = (iszero(pint) && isone(t)) ? var : gensym(:pbs) + xop = add_compute!(ls, varname, :vabs2, [xop], elementbytes) + end + yop = xop + while pint > 0 + t = trailing_zeros(pint) + 1 + pint >>= t + while (t -= 1) >= 0 + xop = add_compute!(ls, gensym(:pbs), :vabs2, [xop], elementbytes) + end + yop = add_compute!(ls, iszero(pint) ? var : gensym(:pbs), :vmul, [xop, yop], elementbytes) + end + yop +end + diff --git a/src/graphs.jl b/src/graphs.jl index 810f6aeb7..c216b48ae 100644 --- a/src/graphs.jl +++ b/src/graphs.jl @@ -244,7 +244,7 @@ end includesarray(ls::LoopSet, array::Symbol) = array ∈ ls.includedarrays -function LoopSet(mod::Symbol)# = :LoopVectorization) +function LoopSet(mod::Symbol, W = Symbol("##Wvecwidth##"), T = Symbol("Tloopeltype"))# = :LoopVectorization) LoopSet( Symbol[], [0], Loop[], Dict{Symbol,Operation}(), @@ -261,7 +261,7 @@ function LoopSet(mod::Symbol)# = :LoopVectorization) Matrix{Float64}(undef, 4, 2), Matrix{Float64}(undef, 4, 2), Bool[], Bool[], - gensym(:W), gensym(:T), mod + W, T, mod ) end diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl index bab2500a7..576c4f036 100644 --- a/src/reconstruct_loopset.jl +++ b/src/reconstruct_loopset.jl @@ -413,7 +413,10 @@ function _avx_loopset(OPSsv, ARFsv, AMsv, LPSYMsv, LBsv, vargs) ) end @generated function _avx_!(::Val{UT}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UT, OPS, ARF, AM, LPSYM, LB} + 1 + 1 ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs) avx_body(ls, UT) end + + diff --git a/src/split_loops.jl b/src/split_loops.jl index b54441582..4ef86d18a 100644 --- a/src/split_loops.jl +++ b/src/split_loops.jl @@ -70,16 +70,17 @@ function lower_and_split_loops(ls::LoopSet) remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end]) ls_2 = split_loopset(ls, remaining_ops) order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2 = choose_order_cost(ls_2) + # U_1 = T_1 = U_2 = T_2 = 2 if cost_1 + cost_2 < cost_fused ls_2_lowered = if length(remaining_ops) > 1 lower_and_split_loops(ls_2) else - lower(ls_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2) + lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2) end return Expr( :block, ls.preamble, - lower(ls_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1), + lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1), ls_2_lowered ) end diff --git a/test/gemm.jl b/test/gemm.jl index abd12c881..6e91fb6c3 100644 --- a/test/gemm.jl +++ b/test/gemm.jl @@ -521,6 +521,13 @@ end end + function twogemms!(Ab, Bb, Cb, A, B) + M, N = size(C); K = size(B,1) + @avx for m in 1:M, k in 1:K, n in 1:N + Ab[m,k] += Cb[m,n] * B[k,n] + Bb[k,n] += A[m,k] * Cb[m,n] + end + end # M = 77; # A = rand(M,M); B = rand(M,M); C = similar(A); # mulCAtB_2x2block_avx!(C,A,B) @@ -632,6 +639,10 @@ Bbit = B .> 0.5 fill!(C, 9999.999); AmulBavx1!(C, A, Bbit) @test C ≈ A * Bbit + Ab = zero(A); Bb = zero(B); + twogemms!(Ab, Bb, C, A, B) + @test Ab ≈ C * B' + @test Bb ≈ A' * C end @time @testset "_avx $T dynamic gemm" begin AmulB_avx1!(C, A, B) diff --git a/test/special.jl b/test/special.jl index a646efd0f..15353453c 100644 --- a/test/special.jl +++ b/test/special.jl @@ -170,6 +170,82 @@ end) lsfeq = LoopVectorization.LoopSet(feq); # lsfeq.operations + + function vpow0!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 0 + end; y + end + function vpown1!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ -1 + end; y + end + function vpow1!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 1 + end; y + end + function vpown2!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ -2 + end; y + end + function vpow2!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 2 + end; y + end + function vpown3!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ -3 + end; y + end + function vpow3!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 3 + end; y + end + function vpown4!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ -4 + end; y + end + function vpow4!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 4 + end; y + end + function vpown5!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ -5 + end; y + end + q = :(for i ∈ eachindex(y, x) + y[i] = x[i] ^ -5 + end); + ls = LoopVectorization.LoopSet(q); + + function vpow5!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 5 + end; y + end + function vpowf!(y, x) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ 2.3 + end; y + end + function vpowf!(y, x, p::Number) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ p + end; y + end + function vpowf!(y, x, p::AbstractArray) + @avx for i ∈ eachindex(y, x) + y[i] = x[i] ^ p[i] + end; y + end for T ∈ (Float32, Float64) @@ -193,7 +269,7 @@ @test ld ≈ trianglelogdetavx(A) @test ld ≈ trianglelogdet_avx(A) - x = rand(T, 1000); + x = rand(T, 999); r1 = similar(x); r2 = similar(x); lse = logsumexp!(r1, x); @@ -218,5 +294,21 @@ @test A1 ≈ A2 fill!(A2, 0); offset_exp_avx!(A2, B) @test A1 ≈ A2 + + @test all(isone, vpow0!(r1, x)) + @test vpown1!(r1, x) ≈ map!(inv, r2, x) + @test vpow1!(r1, x) == x + @test vpown2!(r1, x) ≈ map!(abs2 ∘ inv, r2, x) + @test vpow2!(r1, x) ≈ map!(abs2, r2, x) + @test vpown3!(r1, x) ≈ (r2 .= x .^ -3) + @test vpow3!(r1, x) ≈ (r2 .= x .^ 3) + @test vpown4!(r1, x) ≈ (r2 .= x .^ -4) + @test vpow4!(r1, x) ≈ (r2 .= x .^ 4) + @test vpown5!(r1, x) ≈ (r2 .= x .^ -5) + @test vpow5!(r1, x) ≈ (r2 .= x .^ 5) + @test vpowf!(r1, x) ≈ (r2 .= x .^ 2.3) + @test vpowf!(r1, x, -1.7) ≈ (r2 .= x .^ -1.7) + p = randn(length(x)); + @test vpowf!(r1, x, x) ≈ (r2 .= x .^ x) end end