diff --git a/perf/Project.toml b/perf/Project.toml
index ddbb1be6e..825ed5a2b 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -1,6 +1,8 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
+Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-Graphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
+Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
diff --git a/perf/master_2021_11_01_arrakis.jld2 b/perf/master_2021_11_01_arrakis.jld2
new file mode 100644
index 000000000..6d058a249
Binary files /dev/null and b/perf/master_2021_11_01_arrakis.jld2 differ
diff --git a/perf/perf.jl b/perf/perf.jl
index dfebc0de2..04dec42fe 100644
--- a/perf/perf.jl
+++ b/perf/perf.jl
@@ -1,17 +1,30 @@
 using Flux, GraphNeuralNetworks, Graphs, BenchmarkTools, CUDA
 using DataFrames, Statistics, JLD2, SparseArrays
-CUDA.device!(2)
+using Unitful
+# CUDA.device!(2)
 CUDA.allowscalar(false)
 
-BenchmarkTools.ratio(::Missing, x) = Inf
-BenchmarkTools.ratio(x, ::Missing) = 0.0
-BenchmarkTools.ratio(::Missing, ::Missing) = missing
+function getres(res, str)
+    ismissing(res[str]) && return missing
+    t = median(res[str]).time
+    if t < 1e3
+        t * u"ns"
+    elseif t < 1e6
+        t / 1e3 * u"μs"
+    elseif t < 1e9
+        t / 1e6 * u"ms"
+    else
+        t / 1e9 * u"s"
+    end
+end
 
 function run_single_benchmark(N, c, D, CONV; gtype=:lg)
-    data = erdos_renyi(N, c / (N-1), seed=17)
     X = randn(Float32, D, N)
-    
+
+    data = erdos_renyi(N, c / (N-1), seed=17)
     g = GNNGraph(data; ndata=X, graph_type=gtype)
+
+    # g = rand_graph(N, c*N; ndata=X, graph_type=gtype)
     g_gpu = g |> gpu
 
     m = CONV(D => D)
@@ -58,11 +71,12 @@ function run_benchmarks(;
         c = 6,
         D = 100,
         layers = [GCNConv, GATConv],
-        gtypes = [:coo, :sparse, :dense],
+        gtypes = [:coo],
     )
 
-    df = DataFrame(N=Int[], c=Float64[], layer=String[], gtype=Symbol[],
-                   time_cpu=Any[], time_gpu=Any[]) |> allowmissing
+    df = DataFrame(N=Int[], c=Int[], layer=String[], gtype=Symbol[],
+                   time_fwd_cpu=Any[], time_fwd_gpu=Any[],
+                   time_grad_cpu=Any[], time_grad_gpu=Any[])
 
     for gtype in gtypes
         for N in Ns
@@ -73,31 +87,34 @@ function run_benchmarks(;
                     N = N,
                     c = c,
                     gtype = gtype,
-                    time_cpu = ismissing(res["CPU"]) ? missing : median(res["CPU"]),
-                    time_gpu = ismissing(res["GPU"]) ? missing : median(res["GPU"]),
+                    time_fwd_cpu = getres(res, "CPU_FWD"),
+                    time_fwd_gpu = getres(res, "GPU_FWD"),
+                    time_grad_cpu = getres(res, "CPU_GRAD"),
+                    time_grad_gpu = getres(res, "GPU_GRAD"),
                 )
                 push!(df, row)
+                println(row)
             end
         end
     end
 
-    df.gpu_to_cpu = ratio.(df.time_gpu, df.time_cpu)
+    df.grad_gpu_to_cpu = NoUnits.(df.time_grad_gpu ./ df.time_grad_cpu)
     sort!(df, [:layer, :N, :c, :gtype])
     return df
 end
 
-# df = run_benchmarks()
-# for g in groupby(df, :layer); println(g, "\n"); end
+df = run_benchmarks()
+for g in groupby(df, :layer); println(g, "\n"); end
 
-# @save "perf/perf_master_20210803_carlo.jld2" dfmaster=df
+# @save "master_2021_11_01_arrakis.jld2" dfmaster=df
 ## or
-# @save "perf/perf_pr.jld2" dfpr=df
+# @save "pr.jld2" dfpr=df
 
 
 function compare(dfpr, dfmaster; on=[:N, :c, :gtype, :layer])
     df = outerjoin(dfpr, dfmaster; on=on, makeunique=true, renamecols = :_pr => :_master)
-    df.pr_to_master_cpu = ratio.(df.time_cpu_pr, df.time_cpu_master)
-    df.pr_to_master_gpu = ratio.(df.time_gpu_pr, df.time_gpu_master)
+    df.pr_to_master_cpu = df.time_cpu_pr ./ df.time_cpu_master
+    df.pr_to_master_gpu = df.time_gpu_pr ./ df.time_gpu_master
     return df[:,[:N, :c, :gtype, :layer, :pr_to_master_cpu, :pr_to_master_gpu]]
 end
diff --git a/perf/pr_2021_11_01_arrakis.jld2 b/perf/pr_2021_11_01_arrakis.jld2
new file mode 100644
index 000000000..c55c19974
Binary files /dev/null and b/perf/pr_2021_11_01_arrakis.jld2 differ
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
index 51e8891c6..7bffa6686 100644
--- a/src/GNNGraphs/GNNGraphs.jl
+++ b/src/GNNGraphs/GNNGraphs.jl
@@ -3,6 +3,7 @@ module GNNGraphs
 using SparseArrays
 using Functors: @functor
 using CUDA
+using CUDA.CUSPARSE
 import Graphs
 using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
 import Flux
diff --git a/src/GNNGraphs/convert.jl b/src/GNNGraphs/convert.jl
index 09f0de586..2036432d1 100644
--- a/src/GNNGraphs/convert.jl
+++ b/src/GNNGraphs/convert.jl
@@ -137,11 +137,39 @@
 function to_sparse(coo::COO_T, T::DataType=Int; dir=:out, num_nodes=nothing)
     s, t, eweight = coo
     eweight = isnothing(eweight) ? fill!(similar(s, T), 1) : eweight
     num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
-    A = sparse(s, t, eweight, num_nodes, num_nodes)
+    A = _sparse(s, t, eweight, num_nodes, num_nodes)
     num_edges = length(s)
     return A, num_nodes, num_edges
 end
 
+_sparse(s, t, eweight, n, m) = sparse(s, t, eweight, n, m)
+
+function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n)
+    spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
+    return CuSparseMatrixCSR(spcoo)
+end
+
+# function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n; fmt=:csr)
+#     # Tv = Int32
+#     spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
+#     if fmt == :csc
+#         return CuSparseMatrixCSC(spcoo)
+#     elseif fmt == :csr
+#         return CuSparseMatrixCSR(spcoo)
+#     elseif fmt == :coo
+#         return spcoo
+#     else
+#         error("Format :$fmt not available, use :csc, :csr, or :coo.")
+#     end
+# end
+
+
+# Workaround for https://github.com/JuliaGPU/CUDA.jl/issues/1113#issuecomment-955759875
+function Base.:*(A::CuMatrix, B::CuSparseMatrixCSR)
+    @assert size(A, 2) == size(B, 1)
+    return CuMatrix((B' * A')')
+end
+
 @non_differentiable to_coo(x...)
 @non_differentiable to_dense(x...)
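
A minimal sketch of the sparse conversion path introduced in `src/GNNGraphs/convert.jl` above, assuming a CUDA-capable device: COO indices are wrapped in a `CuSparseMatrixCOO`, converted to CSR, and a dense × sparse product is computed through the same double-adjoint trick used by the `Base.:*` workaround (CUSPARSE ships sparse × dense kernels, not dense × sparse). Sizes and values here are illustrative only.

```julia
using CUDA, CUDA.CUSPARSE

# COO edge list of a tiny 3-node graph (illustrative values)
s = CuArray(Int32[1, 2, 3, 3])      # source node indices
t = CuArray(Int32[2, 3, 1, 2])      # target node indices
w = CuArray(Float32[1, 1, 1, 1])    # edge weights
n = 3

# Same construction as the `_sparse` method for CuVector inputs
Acoo = CuSparseMatrixCOO{Float32, Int32}(s, t, w, (n, n))
Acsr = CuSparseMatrixCSR(Acoo)

# Dense node features, one column per node
X = CUDA.randn(Float32, 8, n)

# X * Acsr computed as in the Base.:* workaround: (Acsr' * X')'
Y = CuMatrix((Acsr' * X')')
```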
diff --git a/src/GNNGraphs/generate.jl b/src/GNNGraphs/generate.jl
index 67f5d1f1b..9f0eeb5d2 100644
--- a/src/GNNGraphs/generate.jl
+++ b/src/GNNGraphs/generate.jl
@@ -1,5 +1,5 @@
 """
-    rand_graph(n, m; bidirected=true, kws...)
+    rand_graph(n, m; bidirected=true, seed=-1, kws...)
 
 Generate a random (Erdós-Renyi) `GNNGraph` with `n` nodes and `m` edges.
 
@@ -43,10 +43,10 @@
 julia> edge_index(g)
 ```
 """
-function rand_graph(n::Integer, m::Integer; bidirected=true, kws...)
+function rand_graph(n::Integer, m::Integer; bidirected=true, seed=-1, kws...)
     if bidirected
         @assert iseven(m) "Need even number of edges for bidirected graphs, given m=$m."
     end
     m2 = bidirected ? m÷2 : m
-    return GNNGraph(Graphs.erdos_renyi(n, m2, is_directed=!bidirected); kws...)
+    return GNNGraph(Graphs.erdos_renyi(n, m2; is_directed=!bidirected, seed); kws...)
 end
diff --git a/src/GNNGraphs/gnngraph.jl b/src/GNNGraphs/gnngraph.jl
index a0bf2440f..019586f05 100644
--- a/src/GNNGraphs/gnngraph.jl
+++ b/src/GNNGraphs/gnngraph.jl
@@ -192,8 +192,8 @@ function GNNGraph(g::GNNGraph; ndata=g.ndata, edata=g.edata, gdata=g.gdata, grap
             ndata, edata, gdata)
 end
 
-function Base.show(io::IO, g::GNNGraph)
-    println(io, "GNNGraph:
+function Base.show(io::IO, g::GNNGraph{T}) where T
+    println(io, "GNNGraph{$T}:
                 num_nodes = $(g.num_nodes)
                 num_edges = $(g.num_edges)
                 num_graphs = $(g.num_graphs)")
diff --git a/src/GNNGraphs/query.jl b/src/GNNGraphs/query.jl
index bb4fb6f29..5e580f284 100644
--- a/src/GNNGraphs/query.jl
+++ b/src/GNNGraphs/query.jl
@@ -74,12 +74,7 @@ function adjacency_list(g::GNNGraph; dir=:out)
 end
 
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=Int; dir=:out)
-    if g.graph[1] isa CuVector
-        # TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
-        A, n, m = to_dense(g.graph, T, num_nodes=g.num_nodes)
-    else
-        A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
-    end
+    A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
     @assert size(A) == (n, n)
     return dir == :out ? A : A'
 end
diff --git a/src/msgpass.jl b/src/msgpass.jl
index 1611ebe56..93411261f 100644
--- a/src/msgpass.jl
+++ b/src/msgpass.jl
@@ -152,11 +152,32 @@ function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(+), xi, xj::AbstractM
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
-    propagate((xi,xj,e)->copyxj(xi,xj,e), g, +, xi, xj, e)
+# Have to define custom rule since CUDA.jl has troubles with some sparse-dense multiplications
+function ChainRulesCore.rrule(::typeof(propagate), ::typeof(copyxj), g::GNNGraph,
+                              ::typeof(+), xi, xj::AbstractMatrix, e)
+    A = adjacency_matrix(g)
+    y = xj * A
+    function propagate_pullback(ȳ)
+        Ȳ = unthunk(ȳ)
+        dxj = Ȳ * A'
+        return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
+    end
+
+    function propagate_pullback(ȳ::CuMatrix)
+        Ȳ = unthunk(ȳ)
+        dxj = CuArray((A * Ȳ')')
+        return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
+    end
+
+    y, propagate_pullback
 end
+
+# ## avoid the fast path on gpu until we have better cuda support
+# function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
+#     propagate((xi,xj,e) -> copyxj(xi,xj,e), g, +, xi, xj, e)
+# end
+
 # function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
 #     A = adjacency_matrix(g)
 #     D = compute_degree(A)
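
The custom `rrule` above only propagates the cotangent of `xj`: since `y = xj * A`, the pullback is `dxj = Ȳ * A'`. A small CPU-only sanity check of that identity against Zygote (Zygote is assumed to be available, as Flux already depends on it):

```julia
using Zygote

xj = randn(Float32, 4, 5)   # features
A  = randn(Float32, 5, 5)   # stands in for the adjacency matrix
Ȳ  = randn(Float32, 4, 5)   # incoming cotangent

y, back = Zygote.pullback((x, a) -> x * a, xj, A)
dxj_ad, _ = back(Ȳ)

@assert dxj_ad ≈ Ȳ * A'     # matches the rule used in propagate_pullback
```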
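
A quick sketch of the new `seed` keyword on `rand_graph`: a fixed seed makes the generated edge set reproducible, while the default `seed=-1` keeps the previous non-deterministic behaviour. This assumes the seeded `Graphs.erdos_renyi` call is deterministic for a given seed.

```julia
using GraphNeuralNetworks

g1 = rand_graph(10, 20; seed=17)
g2 = rand_graph(10, 20; seed=17)

@assert edge_index(g1) == edge_index(g2)   # same seed, same edges
```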