FixedEffects · eloualiche · Jul 21, 2023 · Jan 21, 2023 · Feb 20, 2023 · Feb 20, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
+          - '1.9' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
           - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia.
         os:
           - ubuntu-latest

diff --git a/Project.toml b/Project.toml
@@ -1,25 +1,33 @@
 name = "FixedEffects"
 uuid = "c8885935-8500-56a7-9867-7708b20db0eb"
-version = "2.1.2"
+version = "2.2.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 GroupedArrays = "6407cd72-fade-4a84-8a1e-56e431fc1533"
 
 [compat]
-Requires = "1"
 StatsBase = "0.33, 0.34"
 GroupedArrays = "0.3"
-julia = "1.3"
+julia = "1.9"
 
-[extras]
+[extensions]
+CUDAExt = "CUDA"
+MetalExt = "Metal"
+
+[weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+
+[extras]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "CUDA", "PooledArrays", "CategoricalArrays"]
+test = ["CategoricalArrays", "CUDA", "Metal", "PooledArrays", "Test"]
+
diff --git a/benchmarks/.sublime2Terminal.jl b/benchmarks/.sublime2Terminal.jl
@@ -1,11 +1 @@
-using FixedEffects, Random, Statistics
-Random.seed!(1234)
-N = 10000000
-K = 100
-id1 = rand(1:div(N, K), N)
-id2 = rand(1:K, N)
-fes = [FixedEffect(id1), FixedEffect(id2)]
-x = rand(N)
-
-# simple problem
-@time solve_residuals!(deepcopy(x), fes)
+@time solve_residuals!(deepcopy(x), fes)
diff --git a/benchmarks/benchmark_gpu.jl → benchmarks/benchmark_CUDA.jl b/benchmarks/benchmark_gpu.jl → benchmarks/benchmark_CUDA.jl
@@ -1,4 +1,4 @@
-using FixedEffects, CategoricalArrays, Random, CUDA, BenchmarkTools, Statistics
+using CUDA, FixedEffects, CategoricalArrays, Random, BenchmarkTools, Statistics
 
 Random.seed!(1234)
 N = 10000000
@@ -10,8 +10,8 @@ id3 = categorical(Int.(rand(1:(N/K), N)))
 x = rand(N)
 X = rand(N, 10)
 
-(r_g, it_g, conv_g)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :lsmr_gpu)
-(r_c, it_c, conv_c)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :lsmr);
+(r_g, it_g, conv_g)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]), method = :CUDA)
+(r_c, it_c, conv_c)=solve_residuals!(copy(x), deepcopy([FixedEffect(id1), FixedEffect(id2)]));
 
 
 @show r_c ≈ r_g
@@ -20,19 +20,19 @@ X = rand(N, 10)
 
 
 println("CPU Float64, size(x)=$(size(x)), 2 fixed effects")
-@btime (r_c, it_c, conv_c)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr)
+@btime (r_c, it_c, conv_c)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)])
 println("CPU Float32, size(x)=$(size(x)), 2 fixed effects")
-@btime (r_c, it_c, conv_c)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :lsmr)
+@btime (r_c, it_c, conv_c)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)])
 println("GPU Float64, size(x)=$(size(x)), 2 fixed effects")
-@btime (r_g, it_g, conv_g)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu)
+@btime (r_g, it_g, conv_g)=solve_residuals!($x, [FixedEffect(id1), FixedEffect(id2)], method = :CUDA)
 println("GPU Float32, size(x)=$(size(x)), 2 fixed effects")
-@btime (r_g, it_g, conv_g)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu)
+@btime (r_g, it_g, conv_g)=solve_residuals!(Float32.(x), [FixedEffect(id1), FixedEffect(id2)], method = :CUDA)
 
 
 # Note that most of the GPU time is spend transferring memory 
 using Profile
 Profile.clear()
-@profile solve_residuals!(x, [FixedEffect(id1), FixedEffect(id2)], method = :lsmr_gpu);
+@profile solve_residuals!(x, [FixedEffect(id1), FixedEffect(id2)], method = :CUDA);
 Profile.print(noisefloor=1.0)
 
 
@@ -64,13 +64,13 @@ println("CPU Float32")
 
 println("GPU Float64")
 @time (rg6,i,c)=solve_residuals!(copy(X), [FixedEffect(pid), FixedEffect(fid)],
-                                 method = :lsmr_gpu,
+                                 method = :CUDA,
                                  tol=sqrt(eps(Float32)));  
 @show i
 
 println("GPU Float32")
 @time (rg3,i,c)=solve_residuals!(Float32.(X), [FixedEffect(pid), FixedEffect(fid)],
-                                 method = :lsmr_gpu, 
+                                 method = :CUDA, 
                                  tol=sqrt(eps(Float32)));
 @show i
 @show mean(rg3)

diff --git a/benchmarks/benchmark_Metal.jl b/benchmarks/benchmark_Metal.jl
@@ -0,0 +1,39 @@
+using Revise, Metal, FixedEffects, Random, Statistics
+Random.seed!(1234)
+N = 10000000
+K = 100
+id1 = rand(1:div(N, K), N)
+id2 = rand(1:K, N)
+fes = [FixedEffect(id1), FixedEffect(id2)]
+x = Float32.(rand(N))
+
+# simple problem
+@time solve_residuals!(deepcopy(x), fes)
+#   0.654833 seconds (1.99 k allocations: 390.841 MiB, 3.71% gc time)
+@time solve_residuals!(deepcopy(x), fes; method = :Metal)
+#   0.298326 seconds (129.08 k allocations: 79.208 MiB)
+@time solve_residuals!([x x x x], fes)
+#   1.604061 seconds (1.25 M allocations: 416.364 MiB, 4.21% gc time, 30.57% compilation time)
+@time solve_residuals!([x x x x], fes; method = :Metal)
+#   0.790909 seconds (531.78 k allocations: 204.363 MiB, 3.19% compilation time)
+
+
+
+# More complicated problem
+N = 800000 # number of observations
+M = 400000 # number of workers
+O = 50000 # number of firms
+Random.seed!(1234)
+pid = rand(1:M, N)
+fid = [rand(max(1, div(x, 8)-10):min(O, div(x, 8)+10)) for x in pid]
+x = rand(N)
+fes = [FixedEffect(pid), FixedEffect(fid)]
+
+
+@time solve_residuals!([x x x x], fes; double_precision = false)
+#   8.294446 seconds (225.13 k allocations: 67.777 MiB, 0.11% gc time)
+
+@time solve_residuals!([x x x x], fes; double_precision = false, method = :Metal)
+#   1.605953 seconds (3.25 M allocations: 103.342 MiB, 1.82% gc time)
+
+
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
@@ -0,0 +1,153 @@
+module CUDAExt
+using FixedEffects, CUDA
+using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!,  lsmr!, AbstractFixedEffectLinearMap
+CUDA.allowscalar(false)
+
+##############################################################################
+##
+## Conversion FixedEffect between CPU and GPU
+##
+##############################################################################
+
+# https://github.com/JuliaGPU/CUDA.jl/issues/142
+function _cu(T::Type, fe::FixedEffect)
+	refs = CuArray(fe.refs)
+	interaction = _cu(T, fe.interaction)
+	FixedEffect{typeof(refs), typeof(interaction)}(refs, interaction, fe.n)
+end
+_cu(T::Type, w::UnitWeights) = fill!(CuVector{T}(undef, length(w)), w[1])
+_cu(T::Type, w::AbstractVector) = CuVector{T}(convert(Vector{T}, w))
+
+##############################################################################
+##
+## FixedEffectLinearMap on the GPU (code by Paul Schrimpf)
+##
+## Model matrix of categorical variables
+## mutiplied by diag(1/sqrt(∑w * interaction^2, ..., ∑w * interaction^2) (Jacobi preconditoner)
+##
+## We define these methods used in lsmr! (duck typing):
+## eltype
+## size
+## mul!
+##
+##############################################################################
+
+mutable struct FixedEffectLinearMapCUDA{T} <: AbstractFixedEffectLinearMap{T}
+	fes::Vector{<:FixedEffect}
+	scales::Vector{<:AbstractVector}
+	caches::Vector{<:AbstractVector}
+	nthreads::Int
+end
+
+function FixedEffectLinearMapCUDA{T}(fes::Vector{<:FixedEffect}, nthreads) where {T}
+	fes = [_cu(T, fe) for fe in fes]
+	scales = [CUDA.zeros(T, fe.n) for fe in fes]
+	caches = [CUDA.zeros(T, length(fes[1].interaction)) for fe in fes]
+	return FixedEffectLinearMapCUDA{T}(fes, scales, caches, nthreads)
+end
+
+function FixedEffects.gather!(fecoef::CuVector, refs::CuVector, α::Number, y::CuVector, cache::CuVector, nthreads::Integer)
+	nblocks = cld(length(y), nthreads) 
+	@cuda threads=nthreads blocks=nblocks gather_kernel!(fecoef, refs, α, y, cache)    
+end
+
+function gather_kernel!(fecoef, refs, α, y, cache)
+	index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+	stride = blockDim().x * gridDim().x
+	@inbounds for i = index:stride:length(y)
+		CUDA.atomic_add!(pointer(fecoef, refs[i]), α * y[i] * cache[i])
+	end
+end
+
+function FixedEffects.scatter!(y::CuVector, α::Number, fecoef::CuVector, refs::CuVector, cache::CuVector, nthreads::Integer)
+	nblocks = cld(length(y), nthreads)
+	@cuda threads=nthreads blocks=nblocks scatter_kernel!(y, α, fecoef, refs, cache)
+end
+
+function scatter_kernel!(y, α, fecoef, refs, cache)
+	index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+	stride = blockDim().x * gridDim().x
+	@inbounds for i = index:stride:length(y)
+		y[i] += α * fecoef[refs[i]] * cache[i]
+	end
+end
+
+##############################################################################
+##
+## Implement AbstractFixedEffectSolver interface
+##
+##############################################################################
+
+mutable struct FixedEffectSolverCUDA{T} <: FixedEffects.AbstractFixedEffectSolver{T}
+	m::FixedEffectLinearMapCUDA{T}
+	weights::CuVector{T}
+	b::CuVector{T}
+	r::CuVector{T}
+	x::FixedEffectCoefficients{<: AbstractVector{T}}
+	v::FixedEffectCoefficients{<: AbstractVector{T}}
+	h::FixedEffectCoefficients{<: AbstractVector{T}}
+	hbar::FixedEffectCoefficients{<: AbstractVector{T}}
+	tmp::Vector{T} # used to convert AbstractVector to Vector{T}
+	fes::Vector{<:FixedEffect}
+end
+
+function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:gpu}}, nthreads = 256) where {T}
+	Base.depwarn("The method :gpu is deprecated. Use either :CUDA or :Metal")
+	AbstractFixedEffectSolver{T}(fes, weights, Val(:CUDA), nthreads)
+end
+
+function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:CUDA}}, nthreads = 256) where {T}
+	m = FixedEffectLinearMapCUDA{T}(fes, nthreads)
+	b = CUDA.zeros(T, length(weights))
+	r = CUDA.zeros(T, length(weights))
+	x = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
+	v = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
+	h = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
+	hbar = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
+	tmp = zeros(T, length(weights))
+	feM = FixedEffectSolverCUDA{T}(m, CUDA.zeros(T, length(weights)), b, r, x, v, h, hbar, tmp, fes)
+	FixedEffects.update_weights!(feM, weights)
+end
+
+function FixedEffects.update_weights!(feM::FixedEffectSolverCUDA{T}, weights::AbstractWeights) where {T}
+	copyto!(feM.weights, _cu(T, weights))
+	for (scale, fe) in zip(feM.m.scales, feM.m.fes)
+		scale!(scale, fe.refs, fe.interaction, feM.weights, feM.m.nthreads)
+	end
+	for (cache, scale, fe) in zip(feM.m.caches, feM.m.scales, feM.m.fes)
+		cache!(cache, fe.refs, fe.interaction, feM.weights, scale, feM.m.nthreads)
+	end	
+	return feM
+end
+
+function scale!(scale::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, nthreads::Integer)
+	nblocks = cld(length(refs), nthreads) 
+    fill!(scale, 0)
+	@cuda threads=nthreads blocks=nblocks scale_kernel!(scale, refs, interaction, weights)
+	map!(x -> x > 0 ? 1 / sqrt(x) : 0, scale, scale)
+end
+
+function scale_kernel!(scale, refs, interaction, weights)
+	index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+	stride = blockDim().x * gridDim().x
+	@inbounds for i = index:stride:length(interaction)
+		CUDA.atomic_add!(pointer(scale, refs[i]), abs2(interaction[i]) * weights[i])
+	end
+end
+
+function cache!(cache::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, scale::CuVector, nthreads::Integer)
+	nblocks = cld(length(cache), nthreads) 
+	@cuda threads=nthreads blocks=nblocks cache!_kernel!(cache, refs, interaction, weights, scale)
+end
+
+function cache!_kernel!(cache, refs, interaction, weights, scale)
+	index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+	stride = blockDim().x * gridDim().x
+	@inbounds for i = index:stride:length(cache)
+		cache[i] = interaction[i] * sqrt(weights[i]) * scale[refs[i]]
+	end
+end
+
+
+
+end