diff --git a/.travis.yml b/.travis.yml
index 1161a891e..9ae0e585f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,3 +26,4 @@ env:
     - COVERALLS_PARALLEL=true
   notifications:
     webhooks: "https://coveralls.io/webhook"
+
diff --git a/Project.toml b/Project.toml
index f55fab830..c4cb7e313 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.6.28"
+version = "0.6.29"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 8cf4c7efe..7c6cb0d77 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -12,10 +12,12 @@ using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange, reduced_
     sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
     vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
     vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
+using SLEEFPirates: pow
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 using Base.Meta: isexpr
 
+
 const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer}
 
 export LowDimArray, stridedpointer, vectorizable,
diff --git a/src/add_compute.jl b/src/add_compute.jl
index 3650bd26e..bf5ac95fd 100644
--- a/src/add_compute.jl
+++ b/src/add_compute.jl
@@ -180,6 +180,8 @@ function add_reduction_update_parent!(
     pushop!(ls, child, name(parent))
     opout
 end
+
+
 function add_compute!(
     ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int, position::Int,
     mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
@@ -187,6 +189,7 @@ function add_compute!(
     @assert ex.head === :call
     instr = instruction(first(ex.args))::Symbol
     args = @view(ex.args[2:end])
+    (instr === :(^) && length(args) == 2 && (args[2] isa Number)) && return add_pow!(ls, var, args[1], args[2], elementbytes, position)
     parents = Operation[]
     deps = Symbol[]
     reduceddeps = Symbol[]
@@ -215,12 +218,11 @@ function add_compute!(
         end
     end
     reduction = reduction_ind > 0
+    loopnestview = view(ls.loopsymbols, 1:position)
     if iszero(length(deps)) && reduction
-        loopnestview = view(ls.loopsymbols, 1:position) 
         append!(deps, loopnestview)
         append!(reduceddeps, loopnestview)
     else
-        loopnestview = view(ls.loopsymbols, 1:position) 
         newloopdeps = Symbol[]; newreduceddeps = Symbol[];
         setdiffv!(newloopdeps, newreduceddeps, deps, loopnestview)
         mergesetv!(newreduceddeps, reduceddeps)
@@ -252,3 +254,60 @@ function add_compute!(
     pushop!(ls, op, LHS)
 end
 
+# adds x ^ (p::Real)
+function add_pow!(
+    ls::LoopSet, var::Symbol, x, p::Real, elementbytes::Int, position::Int
+)
+    xop = if x isa Expr
+        add_operation!(ls, gensym(:xpow), x, elementbytes, position)
+    elseif x isa Symbol
+        xo = get(ls.opdict, x, nothing)
+        if isnothing(xo)
+            pushpreamble!(ls, Expr(:(=), var, Expr(:call, :(^), x, p)))
+            return add_constant!(ls, var, elementbytes)
+        end
+        xo
+    elseif x isa Number
+        pushpreamble!(ls, Expr(:(=), var, x ^ p))
+        return add_constant!(ls, var, elementbytes)
+    end
+    pint = round(Int, p)
+    if p != pint
+        pop = add_constant!(ls, p, elementbytes)
+        return add_compute!(ls, var, :^, [xop, pop], elementbytes)
+    end
+    if pint == -1
+        return add_compute!(ls, var, :vinv, [xop], elementbytes)
+    elseif pint < 0
+        xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes)
+        pint = - pint
+    end
+    if pint == 0
+        op = Operation(length(operations(ls)), var, elementbytes, LOOPCONSTANT, constant, NODEPENDENCY, Symbol[], NOPARENTS)
+        push!(ls.preamble_ones, (identifier(op),IntOrFloat))
+        return pushop!(ls, op)
+    elseif pint == 1
+        return add_compute!(ls, var, :identity, [xop], elementbytes)
+    elseif pint == 2
+        return add_compute!(ls, var, :vabs2, [xop], elementbytes)
+    end
+
+    # Implementation from https://github.com/JuliaLang/julia/blob/a965580ba7fd0e8314001521df254e30d686afbf/base/intfuncs.jl#L216
+    t = trailing_zeros(pint) + 1
+    pint >>= t
+    while (t -= 1) > 0
+        varname = (iszero(pint) && isone(t)) ? var : gensym(:pbs)
+        xop = add_compute!(ls, varname, :vabs2, [xop], elementbytes)
+    end
+    yop = xop
+    while pint > 0
+        t = trailing_zeros(pint) + 1
+        pint >>= t
+        while (t -= 1) >= 0
+            xop = add_compute!(ls, gensym(:pbs), :vabs2, [xop], elementbytes)
+        end
+        yop = add_compute!(ls, iszero(pint) ? var : gensym(:pbs), :vmul, [xop, yop], elementbytes)
+    end
+    yop
+end
+
diff --git a/src/graphs.jl b/src/graphs.jl
index 810f6aeb7..c216b48ae 100644
--- a/src/graphs.jl
+++ b/src/graphs.jl
@@ -244,7 +244,7 @@ end
 
 includesarray(ls::LoopSet, array::Symbol) = array ∈ ls.includedarrays
 
-function LoopSet(mod::Symbol)# = :LoopVectorization)
+function LoopSet(mod::Symbol, W = Symbol("##Wvecwidth##"), T = Symbol("Tloopeltype"))# = :LoopVectorization)
     LoopSet(
         Symbol[], [0], Loop[],
         Dict{Symbol,Operation}(),
@@ -261,7 +261,7 @@ function LoopSet(mod::Symbol)# = :LoopVectorization)
         Matrix{Float64}(undef, 4, 2),
         Matrix{Float64}(undef, 4, 2),
         Bool[], Bool[],
-        gensym(:W), gensym(:T), mod
+        W, T, mod
     )
 end
 
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
index bab2500a7..576c4f036 100644
--- a/src/reconstruct_loopset.jl
+++ b/src/reconstruct_loopset.jl
@@ -413,7 +413,10 @@ function _avx_loopset(OPSsv, ARFsv, AMsv, LPSYMsv, LBsv, vargs)
     )
 end
 @generated function _avx_!(::Val{UT}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UT, OPS, ARF, AM, LPSYM, LB}
+    1 + 1
     ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
     avx_body(ls, UT)
 end
 
+
+    
diff --git a/src/split_loops.jl b/src/split_loops.jl
index b54441582..4ef86d18a 100644
--- a/src/split_loops.jl
+++ b/src/split_loops.jl
@@ -70,16 +70,17 @@ function lower_and_split_loops(ls::LoopSet)
         remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
         ls_2 = split_loopset(ls, remaining_ops)
         order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2 = choose_order_cost(ls_2)
+        # U_1 = T_1 = U_2 = T_2 = 2
         if cost_1 + cost_2 < cost_fused
             ls_2_lowered = if length(remaining_ops) > 1
                 lower_and_split_loops(ls_2)
             else
-                lower(ls_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2)
+                lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2)
             end
             return Expr(
                 :block,
                 ls.preamble,
-                lower(ls_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1),
+                lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1),
                 ls_2_lowered
             )
         end
diff --git a/test/gemm.jl b/test/gemm.jl
index abd12c881..6e91fb6c3 100644
--- a/test/gemm.jl
+++ b/test/gemm.jl
@@ -521,6 +521,13 @@
         end
     end
 
+    function twogemms!(Ab, Bb, Cb, A, B)
+        M, N = size(C); K = size(B,1)
+        @avx for m in 1:M, k in 1:K, n in 1:N
+            Ab[m,k] += Cb[m,n] * B[k,n]
+            Bb[k,n] += A[m,k] * Cb[m,n]
+        end
+    end
     # M = 77;
     # A = rand(M,M); B = rand(M,M); C = similar(A);
     # mulCAtB_2x2block_avx!(C,A,B)
@@ -632,6 +639,10 @@
             Bbit = B .> 0.5
             fill!(C, 9999.999); AmulBavx1!(C, A, Bbit)
             @test C ≈ A * Bbit
+            Ab = zero(A); Bb = zero(B);
+            twogemms!(Ab, Bb, C, A, B)
+            @test Ab ≈ C * B'
+            @test Bb ≈ A' * C
         end
         @time @testset "_avx $T dynamic gemm" begin
             AmulB_avx1!(C, A, B)
diff --git a/test/special.jl b/test/special.jl
index a646efd0f..15353453c 100644
--- a/test/special.jl
+++ b/test/special.jl
@@ -170,6 +170,82 @@
             end)
     lsfeq = LoopVectorization.LoopSet(feq);
     # lsfeq.operations
+
+    function vpow0!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 0
+        end; y
+    end
+    function vpown1!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -1
+        end; y
+    end
+    function vpow1!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 1
+        end; y
+    end
+    function vpown2!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -2
+        end; y
+    end
+    function vpow2!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 2
+        end; y
+    end
+    function vpown3!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -3
+        end; y
+    end
+    function vpow3!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 3
+        end; y
+    end
+    function vpown4!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -4
+        end; y
+    end
+    function vpow4!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 4
+        end; y
+    end
+    function vpown5!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -5
+        end; y
+    end
+    q = :(for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ -5
+          end);
+    ls = LoopVectorization.LoopSet(q);
+    
+    function vpow5!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 5
+        end; y
+    end
+    function vpowf!(y, x)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ 2.3
+        end; y
+    end
+    function vpowf!(y, x, p::Number)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ p
+        end; y
+    end
+    function vpowf!(y, x, p::AbstractArray)
+        @avx for i ∈ eachindex(y, x)
+            y[i] = x[i] ^ p[i]
+        end; y
+    end
     
     
     for T ∈ (Float32, Float64)
@@ -193,7 +269,7 @@
         @test ld ≈ trianglelogdetavx(A)
         @test ld ≈ trianglelogdet_avx(A)
 
-        x = rand(T, 1000);
+        x = rand(T, 999);
         r1 = similar(x);
         r2 = similar(x);
         lse = logsumexp!(r1, x);
@@ -218,5 +294,21 @@
         @test A1 ≈ A2
         fill!(A2, 0); offset_exp_avx!(A2, B)
         @test A1 ≈ A2
+
+        @test all(isone, vpow0!(r1, x))
+        @test vpown1!(r1, x) ≈ map!(inv, r2, x)
+        @test vpow1!(r1, x) == x
+        @test vpown2!(r1, x) ≈ map!(abs2 ∘ inv, r2, x)
+        @test vpow2!(r1, x) ≈ map!(abs2, r2, x)
+        @test vpown3!(r1, x) ≈ (r2 .= x .^ -3)
+        @test vpow3!(r1, x) ≈ (r2 .= x .^ 3)
+        @test vpown4!(r1, x) ≈ (r2 .= x .^ -4)
+        @test vpow4!(r1, x) ≈ (r2 .= x .^ 4)
+        @test vpown5!(r1, x) ≈ (r2 .= x .^ -5)
+        @test vpow5!(r1, x) ≈ (r2 .= x .^ 5)
+        @test vpowf!(r1, x) ≈ (r2 .= x .^ 2.3)
+        @test vpowf!(r1, x, -1.7) ≈ (r2 .= x .^ -1.7)
+        p = randn(length(x));
+        @test vpowf!(r1, x, x) ≈ (r2 .= x .^ x)
     end
 end