Skip to content

Add Metal and CUDA as extensions #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.3' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
- '1.9' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
- '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia.
os:
- ubuntu-latest
Expand Down
20 changes: 14 additions & 6 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,25 +1,33 @@
name = "FixedEffects"
uuid = "c8885935-8500-56a7-9867-7708b20db0eb"
version = "2.1.2"
version = "2.2.0"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
GroupedArrays = "6407cd72-fade-4a84-8a1e-56e431fc1533"

[compat]
Requires = "1"
StatsBase = "0.33, 0.34"
GroupedArrays = "0.3"
julia = "1.3"
julia = "1.9"

[extras]
[extensions]
CUDAExt = "CUDA"
MetalExt = "Metal"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extras]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "CUDA", "PooledArrays", "CategoricalArrays"]
test = ["CategoricalArrays", "CUDA", "Metal", "PooledArrays", "Test"]

12 changes: 1 addition & 11 deletions benchmarks/.sublime2Terminal.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1 @@
using FixedEffects, Random, Statistics
Random.seed!(1234)
N = 10000000
K = 100
id1 = rand(1:div(N, K), N)
id2 = rand(1:K, N)
fes = [FixedEffect(id1), FixedEffect(id2)]
x = rand(N)

# simple problem
@time solve_residuals!(deepcopy(x), fes)
@time solve_residuals!(deepcopy(x), fes)
20 changes: 10 additions & 10 deletions benchmarks/benchmark_gpu.jl → benchmarks/benchmark_CUDA.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using FixedEffects, CategoricalArrays, Random, CUDA, BenchmarkTools, Statistics
using CUDA, FixedEffects, CategoricalArrays, Random, BenchmarkTools, Statistics

Random.seed!(1234)
N = 10000000
Expand All @@ -10,8 +10,8 @@ id3 = categorical(Int.(rand(1:(N/K), N)))
x = rand(N)
X = rand(N, 10)

(r_g, it_g, conv_g)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :lsmr_gpu)
(r_c, it_c, conv_c)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :lsmr);
(r_g, it_g, conv_g)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :CUDA)
(r_c, it_c, conv_c)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]));


@show r_c ≈ r_g
Expand All @@ -20,19 +20,19 @@ X = rand(N, 10)


println("CPU Float64, size(x)=$(size(x)), 2 fixed effects")
@btime (r_c, it_c, conv_c)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr)
@btime (r_c, it_c, conv_c)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)])
println("CPU Float32, size(x)=$(size(x)), 2 fixed effects")
@btime (r_c, it_c, conv_c)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :lsmr)
@btime (r_c, it_c, conv_c)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)])
println("GPU Float64, size(x)=$(size(x)), 2 fixed effects")
@btime (r_g, it_g, conv_g)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu)
@btime (r_g, it_g, conv_g)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :CUDA)
println("GPU Float32, size(x)=$(size(x)), 2 fixed effects")
@btime (r_g, it_g, conv_g)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu)
@btime (r_g, it_g, conv_g)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :CUDA)


# Note that most of the GPU time is spend transferring memory
using Profile
Profile.clear()
@profile solve_residuals!(x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu);
@profile solve_residuals!(x, [FixedEffect(id1), FixedEffect(id2)], method = :CUDA);
Profile.print(noisefloor=1.0)


Expand Down Expand Up @@ -64,13 +64,13 @@ println("CPU Float32")

println("GPU Float64")
@time (rg6,i,c)=solve_residuals!(copy(X), [FixedEffect(pid), FixedEffect(fid)],
method = :lsmr_gpu,
method = :CUDA,
tol=sqrt(eps(Float32)));
@show i

println("GPU Float32")
@time (rg3,i,c)=solve_residuals!(Float32.(X), [FixedEffect(pid), FixedEffect(fid)],
method = :lsmr_gpu,
method = :CUDA,
tol=sqrt(eps(Float32)));
@show i
@show mean(rg3)
Expand Down
39 changes: 39 additions & 0 deletions benchmarks/benchmark_Metal.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
using Revise, Metal, FixedEffects, Random, Statistics
Random.seed!(1234)
N = 10000000
K = 100
id1 = rand(1:div(N, K), N)
id2 = rand(1:K, N)
fes = [FixedEffect(id1), FixedEffect(id2)]
x = Float32.(rand(N))

# simple problem
@time solve_residuals!(deepcopy(x), fes)
# 0.654833 seconds (1.99 k allocations: 390.841 MiB, 3.71% gc time)
@time solve_residuals!(deepcopy(x), fes; method = :Metal)
# 0.298326 seconds (129.08 k allocations: 79.208 MiB)
@time solve_residuals!([x x x x], fes)
# 1.604061 seconds (1.25 M allocations: 416.364 MiB, 4.21% gc time, 30.57% compilation time)
@time solve_residuals!([x x x x], fes; method = :Metal)
# 0.790909 seconds (531.78 k allocations: 204.363 MiB, 3.19% compilation time)



# More complicated problem
N = 800000 # number of observations
M = 400000 # number of workers
O = 50000 # number of firms
Random.seed!(1234)
pid = rand(1:M, N)
fid = [rand(max(1, div(x, 8)-10):min(O, div(x, 8)+10)) for x in pid]
x = rand(N)
fes = [FixedEffect(pid), FixedEffect(fid)]


@time solve_residuals!([x x x x], fes; double_precision = false)
# 8.294446 seconds (225.13 k allocations: 67.777 MiB, 0.11% gc time)

@time solve_residuals!([x x x x], fes; double_precision = false, method = :Metal)
# 1.605953 seconds (3.25 M allocations: 103.342 MiB, 1.82% gc time)


153 changes: 153 additions & 0 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
module CUDAExt
using FixedEffects, CUDA
using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!, lsmr!, AbstractFixedEffectLinearMap
CUDA.allowscalar(false)

##############################################################################
##
## Conversion FixedEffect between CPU and GPU
##
##############################################################################

# https://github.com/JuliaGPU/CUDA.jl/issues/142
function _cu(T::Type, fe::FixedEffect)
refs = CuArray(fe.refs)
interaction = _cu(T, fe.interaction)
FixedEffect{typeof(refs), typeof(interaction)}(refs, interaction, fe.n)
end
_cu(T::Type, w::UnitWeights) = fill!(CuVector{T}(undef, length(w)), w[1])
_cu(T::Type, w::AbstractVector) = CuVector{T}(convert(Vector{T}, w))

##############################################################################
##
## FixedEffectLinearMap on the GPU (code by Paul Schrimpf)
##
## Model matrix of categorical variables
## mutiplied by diag(1/sqrt(∑w * interaction^2, ..., ∑w * interaction^2) (Jacobi preconditoner)
##
## We define these methods used in lsmr! (duck typing):
## eltype
## size
## mul!
##
##############################################################################

mutable struct FixedEffectLinearMapCUDA{T} <: AbstractFixedEffectLinearMap{T}
fes::Vector{<:FixedEffect}
scales::Vector{<:AbstractVector}
caches::Vector{<:AbstractVector}
nthreads::Int
end

function FixedEffectLinearMapCUDA{T}(fes::Vector{<:FixedEffect}, nthreads) where {T}
fes = [_cu(T, fe) for fe in fes]
scales = [CUDA.zeros(T, fe.n) for fe in fes]
caches = [CUDA.zeros(T, length(fes[1].interaction)) for fe in fes]
return FixedEffectLinearMapCUDA{T}(fes, scales, caches, nthreads)
end

function FixedEffects.gather!(fecoef::CuVector, refs::CuVector, α::Number, y::CuVector, cache::CuVector, nthreads::Integer)
nblocks = cld(length(y), nthreads)
@cuda threads=nthreads blocks=nblocks gather_kernel!(fecoef, refs, α, y, cache)
end

function gather_kernel!(fecoef, refs, α, y, cache)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
@inbounds for i = index:stride:length(y)
CUDA.atomic_add!(pointer(fecoef, refs[i]), α * y[i] * cache[i])
end
end

function FixedEffects.scatter!(y::CuVector, α::Number, fecoef::CuVector, refs::CuVector, cache::CuVector, nthreads::Integer)
nblocks = cld(length(y), nthreads)
@cuda threads=nthreads blocks=nblocks scatter_kernel!(y, α, fecoef, refs, cache)
end

function scatter_kernel!(y, α, fecoef, refs, cache)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
@inbounds for i = index:stride:length(y)
y[i] += α * fecoef[refs[i]] * cache[i]
end
end

##############################################################################
##
## Implement AbstractFixedEffectSolver interface
##
##############################################################################

mutable struct FixedEffectSolverCUDA{T} <: FixedEffects.AbstractFixedEffectSolver{T}
m::FixedEffectLinearMapCUDA{T}
weights::CuVector{T}
b::CuVector{T}
r::CuVector{T}
x::FixedEffectCoefficients{<: AbstractVector{T}}
v::FixedEffectCoefficients{<: AbstractVector{T}}
h::FixedEffectCoefficients{<: AbstractVector{T}}
hbar::FixedEffectCoefficients{<: AbstractVector{T}}
tmp::Vector{T} # used to convert AbstractVector to Vector{T}
fes::Vector{<:FixedEffect}
end

function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:gpu}}, nthreads = 256) where {T}
Base.depwarn("The method :gpu is deprecated. Use either :CUDA or :Metal")
AbstractFixedEffectSolver{T}(fes, weights, Val(:CUDA), nthreads)
end

function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:CUDA}}, nthreads = 256) where {T}
m = FixedEffectLinearMapCUDA{T}(fes, nthreads)
b = CUDA.zeros(T, length(weights))
r = CUDA.zeros(T, length(weights))
x = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
v = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
h = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
hbar = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
tmp = zeros(T, length(weights))
feM = FixedEffectSolverCUDA{T}(m, CUDA.zeros(T, length(weights)), b, r, x, v, h, hbar, tmp, fes)
FixedEffects.update_weights!(feM, weights)
end

function FixedEffects.update_weights!(feM::FixedEffectSolverCUDA{T}, weights::AbstractWeights) where {T}
copyto!(feM.weights, _cu(T, weights))
for (scale, fe) in zip(feM.m.scales, feM.m.fes)
scale!(scale, fe.refs, fe.interaction, feM.weights, feM.m.nthreads)
end
for (cache, scale, fe) in zip(feM.m.caches, feM.m.scales, feM.m.fes)
cache!(cache, fe.refs, fe.interaction, feM.weights, scale, feM.m.nthreads)
end
return feM
end

function scale!(scale::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, nthreads::Integer)
nblocks = cld(length(refs), nthreads)
fill!(scale, 0)
@cuda threads=nthreads blocks=nblocks scale_kernel!(scale, refs, interaction, weights)
map!(x -> x > 0 ? 1 / sqrt(x) : 0, scale, scale)
end

function scale_kernel!(scale, refs, interaction, weights)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
@inbounds for i = index:stride:length(interaction)
CUDA.atomic_add!(pointer(scale, refs[i]), abs2(interaction[i]) * weights[i])
end
end

function cache!(cache::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, scale::CuVector, nthreads::Integer)
nblocks = cld(length(cache), nthreads)
@cuda threads=nthreads blocks=nblocks cache!_kernel!(cache, refs, interaction, weights, scale)
end

function cache!_kernel!(cache, refs, interaction, weights, scale)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = blockDim().x * gridDim().x
@inbounds for i = index:stride:length(cache)
cache[i] = interaction[i] * sqrt(weights[i]) * scale[refs[i]]
end
end



end
Loading