Skip to content

Commit

Permalink
Fix bug in loop splitting, and add literal ^ support (fixes #85).
Browse files Browse the repository at this point in the history
  • Loading branch information
chriselrod committed Apr 6, 2020
1 parent 6d299e0 commit 89676e0
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 8 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ env:
- COVERALLS_PARALLEL=true
notifications:
webhooks: "https://coveralls.io/webhook"

2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LoopVectorization"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
authors = ["Chris Elrod <[email protected]>"]
version = "0.6.28"
version = "0.6.29"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
2 changes: 2 additions & 0 deletions src/LoopVectorization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange, reduced_
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
using SLEEFPirates: pow
using Base.Broadcast: Broadcasted, DefaultArrayStyle
using LinearAlgebra: Adjoint, Transpose
using Base.Meta: isexpr


const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer}

export LowDimArray, stridedpointer, vectorizable,
Expand Down
63 changes: 61 additions & 2 deletions src/add_compute.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,16 @@ function add_reduction_update_parent!(
pushop!(ls, child, name(parent))
opout
end


function add_compute!(
ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int, position::Int,
mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
)
@assert ex.head === :call
instr = instruction(first(ex.args))::Symbol
args = @view(ex.args[2:end])
(instr === :(^) && length(args) == 2 && (args[2] isa Number)) && return add_pow!(ls, var, args[1], args[2], elementbytes, position)
parents = Operation[]
deps = Symbol[]
reduceddeps = Symbol[]
Expand Down Expand Up @@ -215,12 +218,11 @@ function add_compute!(
end
end
reduction = reduction_ind > 0
loopnestview = view(ls.loopsymbols, 1:position)
if iszero(length(deps)) && reduction
loopnestview = view(ls.loopsymbols, 1:position)
append!(deps, loopnestview)
append!(reduceddeps, loopnestview)
else
loopnestview = view(ls.loopsymbols, 1:position)
newloopdeps = Symbol[]; newreduceddeps = Symbol[];
setdiffv!(newloopdeps, newreduceddeps, deps, loopnestview)
mergesetv!(newreduceddeps, reduceddeps)
Expand Down Expand Up @@ -252,3 +254,60 @@ function add_compute!(
pushop!(ls, op, LHS)
end

# adds x ^ (p::Real)
function add_pow!(
ls::LoopSet, var::Symbol, x, p::Real, elementbytes::Int, position::Int
)
xop = if x isa Expr
add_operation!(ls, gensym(:xpow), x, elementbytes, position)
elseif x isa Symbol
xo = get(ls.opdict, x, nothing)
if isnothing(xo)
pushpreamble!(ls, Expr(:(=), var, Expr(:call, :(^), x, p)))
return add_constant!(ls, var, elementbytes)
end
xo
elseif x isa Number
pushpreamble!(ls, Expr(:(=), var, x ^ p))
return add_constant!(ls, var, elementbytes)
end
pint = round(Int, p)
if p != pint
pop = add_constant!(ls, p, elementbytes)
return add_compute!(ls, var, :^, [xop, pop], elementbytes)
end
if pint == -1
return add_compute!(ls, var, :vinv, [xop], elementbytes)
elseif pint < 0
xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes)
pint = - pint
end
if pint == 0
op = Operation(length(operations(ls)), var, elementbytes, LOOPCONSTANT, constant, NODEPENDENCY, Symbol[], NOPARENTS)
push!(ls.preamble_ones, (identifier(op),IntOrFloat))
return pushop!(ls, op)
elseif pint == 1
return add_compute!(ls, var, :identity, [xop], elementbytes)
elseif pint == 2
return add_compute!(ls, var, :vabs2, [xop], elementbytes)
end

# Implementation from https://github.com/JuliaLang/julia/blob/a965580ba7fd0e8314001521df254e30d686afbf/base/intfuncs.jl#L216
t = trailing_zeros(pint) + 1
pint >>= t
while (t -= 1) > 0
varname = (iszero(pint) && isone(t)) ? var : gensym(:pbs)
xop = add_compute!(ls, varname, :vabs2, [xop], elementbytes)
end
yop = xop
while pint > 0
t = trailing_zeros(pint) + 1
pint >>= t
while (t -= 1) >= 0
xop = add_compute!(ls, gensym(:pbs), :vabs2, [xop], elementbytes)
end
yop = add_compute!(ls, iszero(pint) ? var : gensym(:pbs), :vmul, [xop, yop], elementbytes)
end
yop
end

4 changes: 2 additions & 2 deletions src/graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ end

includesarray(ls::LoopSet, array::Symbol) = array ls.includedarrays

function LoopSet(mod::Symbol)# = :LoopVectorization)
function LoopSet(mod::Symbol, W = Symbol("##Wvecwidth##"), T = Symbol("Tloopeltype"))# = :LoopVectorization)
LoopSet(
Symbol[], [0], Loop[],
Dict{Symbol,Operation}(),
Expand All @@ -261,7 +261,7 @@ function LoopSet(mod::Symbol)# = :LoopVectorization)
Matrix{Float64}(undef, 4, 2),
Matrix{Float64}(undef, 4, 2),
Bool[], Bool[],
gensym(:W), gensym(:T), mod
W, T, mod
)
end

Expand Down
3 changes: 3 additions & 0 deletions src/reconstruct_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,10 @@ function _avx_loopset(OPSsv, ARFsv, AMsv, LPSYMsv, LBsv, vargs)
)
end
@generated function _avx_!(::Val{UT}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UT, OPS, ARF, AM, LPSYM, LB}
1 + 1
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
avx_body(ls, UT)
end



5 changes: 3 additions & 2 deletions src/split_loops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,17 @@ function lower_and_split_loops(ls::LoopSet)
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
ls_2 = split_loopset(ls, remaining_ops)
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2 = choose_order_cost(ls_2)
# U_1 = T_1 = U_2 = T_2 = 2
if cost_1 + cost_2 < cost_fused
ls_2_lowered = if length(remaining_ops) > 1
lower_and_split_loops(ls_2)
else
lower(ls_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2)
lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2)
end
return Expr(
:block,
ls.preamble,
lower(ls_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1),
lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1),
ls_2_lowered
)
end
Expand Down
11 changes: 11 additions & 0 deletions test/gemm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,13 @@
end
end

function twogemms!(Ab, Bb, Cb, A, B)
M, N = size(C); K = size(B,1)
@avx for m in 1:M, k in 1:K, n in 1:N
Ab[m,k] += Cb[m,n] * B[k,n]
Bb[k,n] += A[m,k] * Cb[m,n]
end
end
# M = 77;
# A = rand(M,M); B = rand(M,M); C = similar(A);
# mulCAtB_2x2block_avx!(C,A,B)
Expand Down Expand Up @@ -632,6 +639,10 @@
Bbit = B .> 0.5
fill!(C, 9999.999); AmulBavx1!(C, A, Bbit)
@test C A * Bbit
Ab = zero(A); Bb = zero(B);
twogemms!(Ab, Bb, C, A, B)
@test Ab C * B'
@test Bb A' * C
end
@time @testset "_avx $T dynamic gemm" begin
AmulB_avx1!(C, A, B)
Expand Down
94 changes: 93 additions & 1 deletion test/special.jl
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,82 @@
end)
lsfeq = LoopVectorization.LoopSet(feq);
# lsfeq.operations

function vpow0!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 0
end; y
end
function vpown1!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ -1
end; y
end
function vpow1!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 1
end; y
end
function vpown2!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ -2
end; y
end
function vpow2!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 2
end; y
end
function vpown3!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ -3
end; y
end
function vpow3!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 3
end; y
end
function vpown4!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ -4
end; y
end
function vpow4!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 4
end; y
end
function vpown5!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ -5
end; y
end
q = :(for i eachindex(y, x)
y[i] = x[i] ^ -5
end);
ls = LoopVectorization.LoopSet(q);

function vpow5!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 5
end; y
end
function vpowf!(y, x)
@avx for i eachindex(y, x)
y[i] = x[i] ^ 2.3
end; y
end
function vpowf!(y, x, p::Number)
@avx for i eachindex(y, x)
y[i] = x[i] ^ p
end; y
end
function vpowf!(y, x, p::AbstractArray)
@avx for i eachindex(y, x)
y[i] = x[i] ^ p[i]
end; y
end


for T (Float32, Float64)
Expand All @@ -193,7 +269,7 @@
@test ld trianglelogdetavx(A)
@test ld trianglelogdet_avx(A)

x = rand(T, 1000);
x = rand(T, 999);
r1 = similar(x);
r2 = similar(x);
lse = logsumexp!(r1, x);
Expand All @@ -218,5 +294,21 @@
@test A1 A2
fill!(A2, 0); offset_exp_avx!(A2, B)
@test A1 A2

@test all(isone, vpow0!(r1, x))
@test vpown1!(r1, x) map!(inv, r2, x)
@test vpow1!(r1, x) == x
@test vpown2!(r1, x) map!(abs2 inv, r2, x)
@test vpow2!(r1, x) map!(abs2, r2, x)
@test vpown3!(r1, x) (r2 .= x .^ -3)
@test vpow3!(r1, x) (r2 .= x .^ 3)
@test vpown4!(r1, x) (r2 .= x .^ -4)
@test vpow4!(r1, x) (r2 .= x .^ 4)
@test vpown5!(r1, x) (r2 .= x .^ -5)
@test vpow5!(r1, x) (r2 .= x .^ 5)
@test vpowf!(r1, x) (r2 .= x .^ 2.3)
@test vpowf!(r1, x, -1.7) (r2 .= x .^ -1.7)
p = randn(length(x));
@test vpowf!(r1, x, x) (r2 .= x .^ x)
end
end

1 comment on commit 89676e0

@timholy
Copy link
Contributor

@timholy timholy commented on 89676e0 Apr 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Please sign in to comment.