
sparse cuda arrays support #136

Closed · wants to merge 6 commits

55 changes: 36 additions & 19 deletions perf/perf.jl
@@ -1,17 +1,30 @@
using Flux, GraphNeuralNetworks, Graphs, BenchmarkTools, CUDA
using DataFrames, Statistics, JLD2, SparseArrays
CUDA.device!(2)
using Unitful
# CUDA.device!(2)
CUDA.allowscalar(false)

BenchmarkTools.ratio(::Missing, x) = Inf
BenchmarkTools.ratio(x, ::Missing) = 0.0
BenchmarkTools.ratio(::Missing, ::Missing) = missing
function getres(res, str)
ismissing(res[str]) && return missing
t = median(res[str]).time
if t < 1e3
t * u"ns"
elseif t < 1e6
t / 1e3 * u"μs"
elseif t < 1e9
t / 1e6 * u"ms"
else
t / 1e9 * u"s"
end
end
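
# A hedged usage sketch: `res` is the result dictionary produced by
# `run_single_benchmark` below (keys mirror its "CPU_FWD"/"GPU_FWD" strings):
#   res = Dict{String,Any}("CPU_FWD" => @benchmark sum(rand(100)))
#   getres(res, "CPU_FWD")      # median time as a Unitful quantity, e.g. 123.4 ns
#   res["GPU_FWD"] = missing    # a skipped benchmark stays `missing`
#   getres(res, "GPU_FWD")      # missing propagates into the DataFrame columns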

function run_single_benchmark(N, c, D, CONV; gtype=:lg)
data = erdos_renyi(N, c / (N-1), seed=17)
X = randn(Float32, D, N)

g = GNNGraph(data; ndata=X, graph_type=gtype)

# g = rand_graph(N, c*N; ndata=X, graph_type=gtype)
g_gpu = g |> gpu

m = CONV(D => D)
@@ -58,11 +71,12 @@ function run_benchmarks(;
c = 6,
D = 100,
layers = [GCNConv, GATConv],
gtypes = [:coo, :sparse, :dense],
gtypes = [:coo],
)

df = DataFrame(N=Int[], c=Float64[], layer=String[], gtype=Symbol[],
time_cpu=Any[], time_gpu=Any[]) |> allowmissing
df = DataFrame(N=Int[], c=Int[], layer=String[], gtype=Symbol[],
time_fwd_cpu=Any[], time_fwd_gpu=Any[],
time_grad_cpu=Any[], time_grad_gpu=Any[])

for gtype in gtypes
for N in Ns
@@ -73,34 +87,37 @@
N = N,
c = c,
gtype = gtype,
time_cpu = ismissing(res["CPU"]) ? missing : median(res["CPU"]),
time_gpu = ismissing(res["GPU"]) ? missing : median(res["GPU"]),
time_fwd_cpu = getres(res, "CPU_FWD"),
time_fwd_gpu = getres(res, "GPU_FWD"),
time_grad_cpu = getres(res, "CPU_GRAD"),
time_grad_gpu = getres(res, "GPU_GRAD"),
)
push!(df, row)
println(row)
end
end
end

df.gpu_to_cpu = ratio.(df.time_gpu, df.time_cpu)
df.grad_gpu_to_cpu = NoUnits.(df.time_grad_gpu ./ df.time_grad_cpu)
sort!(df, [:layer, :N, :c, :gtype])
return df
end

# df = run_benchmarks()
# for g in groupby(df, :layer); println(g, "\n"); end
df = run_benchmarks()
for g in groupby(df, :layer); println(g, "\n"); end

# @save "perf/perf_master_20210803_carlo.jld2" dfmaster=df
# @save "master_2021_11_01_arrakis.jld2" dfmaster=df
## or
# @save "perf/perf_pr.jld2" dfpr=df
# @save "pr.jld2" dfpr=df


function compare(dfpr, dfmaster; on=[:N, :c, :gtype, :layer])
df = outerjoin(dfpr, dfmaster; on=on, makeunique=true, renamecols = :_pr => :_master)
df.pr_to_master_cpu = ratio.(df.time_cpu_pr, df.time_cpu_master)
df.pr_to_master_gpu = ratio.(df.time_gpu_pr, df.time_gpu_master)
df.pr_to_master_cpu = df.time_cpu_pr ./ df.time_cpu_master
df.pr_to_master_gpu = df.time_gpu_pr ./ df.time_gpu_master
return df[:,[:N, :c, :gtype, :layer, :pr_to_master_cpu, :pr_to_master_gpu]]
end

# @load "perf/perf_pr.jld2" dfpr
# @load "perf/perf_master.jld2" dfmaster
# compare(dfpr, dfmaster)
30 changes: 9 additions & 21 deletions src/GNNGraphs/convert.jl
@@ -81,20 +81,16 @@ to_dense(A::AbstractSparseMatrix, x...; kws...) = to_dense(collect(A), x...; kws...)

function to_dense(A::ADJMAT_T, T=nothing; dir=:out, num_nodes=nothing, weighted=true)
@assert dir ∈ [:out, :in]
T = T === nothing ? eltype(A) : T
num_nodes = size(A, 1)
@assert num_nodes == size(A, 2)
# @assert all(x -> (x == 1) || (x == 0), A)
num_edges = numnonzeros(A)
if dir == :in
A = A'
end
if T != eltype(A)
A = T.(A)
end
if !weighted
A = map(x -> ifelse(x > 0, T(1), T(0)), A)
A = binarize(A)
end
A = convert_eltype(T, A)
return A, num_nodes, num_edges
end
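
# A hedged sketch of the conversion semantics on plain CPU arrays:
#   A, n, m = to_dense([0 1; 1 0], Float32)                  # Matrix{Float32}, 2 nodes, 2 edges
#   B, _, _ = to_dense([0 2; 3 0], Float32; weighted=false)  # weights replaced by 0/1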

@@ -128,10 +124,7 @@ function to_dense(coo::COO_T, T=nothing; dir=:out, num_nodes=nothing, weighted=true)
if val === nothing || !weighted
val = ones_like(s, T)
end
if eltype(val) != T
val = T.(val)
end

val = convert_eltype(T, val)
idxs = s .+ n .* (t .- 1)

## using scatter instead of indexing since there could be multiple edges
@@ -149,20 +142,17 @@ function to_sparse(A::ADJMAT_T, T=nothing; dir=:out, num_nodes=nothing, weighted=true)
@assert dir ∈ [:out, :in]
num_nodes = size(A, 1)
@assert num_nodes == size(A, 2)
T = T === nothing ? eltype(A) : T
num_edges = A isa AbstractSparseMatrix ? nnz(A) : count(!=(0), A)
if dir == :in
A = A'
end
if T != eltype(A)
A = T.(A)
end
if !(A isa AbstractSparseMatrix)
A = sparse(A)
A = _sparse(A)
end
if !weighted
A = map(x -> ifelse(x > 0, T(1), T(0)), A)
A = binarize(A)
end
A = convert_eltype(T, A)
num_edges = nnz(A)
return A, num_nodes, num_edges
end

@@ -180,10 +170,8 @@ function to_sparse(coo::COO_T, T=nothing; dir=:out, num_nodes=nothing, weighted=true)
end

num_nodes::Int = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
A = sparse(s, t, eweight, num_nodes, num_nodes)
A = _sparse(s, t, eweight, num_nodes, num_nodes)
num_edges::Int = nnz(A)
if eltype(A) != T
A = T.(A)
end
A = convert_eltype(T, A)
return A, num_nodes, num_edges
end
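
# A hedged sketch (`COO_T` is the (source, target, weight) triple used by
# GNNGraphs; the default `weighted=true` keeps the given weights):
#   s, t, w = [1, 2, 3], [2, 3, 1], [0.5, 1.0, 2.0]
#   A, n, m = to_sparse((s, t, w), Float64; num_nodes=3)
#   A[1, 2] == 0.5 && n == 3 && m == 3    # true; A[s[k], t[k]] == w[k]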
25 changes: 11 additions & 14 deletions src/GNNGraphs/query.jl
@@ -131,35 +131,32 @@ adjacency_list(g::GNNGraph; dir=:out) = adjacency_list(g, 1:g.num_nodes; dir)


"""
adjacency_matrix(g::GNNGraph, T=eltype(g); dir=:out, weighted=true)
adjacency_matrix(g::GNNGraph, [T]; dir=:out, weighted=true)

Return the adjacency matrix `A` for the graph `g`.

If `dir=:out`, `A[i,j] > 0` denotes the presence of an edge from node `i` to node `j`.
If `dir=:in` instead, `A[i,j] > 0` denotes the presence of an edge from node `j` to node `i`.

User may specify the eltype `T` of the returned matrix.
The user can specify the eltype `T` of the returned matrix.

If `weighted=true`, `A` contains the edge weights if any are present; otherwise the elements of `A` are either 0 or 1.
"""
function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=eltype(g); dir=:out, weighted=true)
if g.graph[1] isa CuVector
# TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
A, n, m = to_dense(g.graph, T; num_nodes=g.num_nodes, weighted)
else
A, n, m = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
end
@assert size(A) == (n, n)
function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=eltype(g); dir=:out, weighted=true)
A, num_nodes, num_edges = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
@assert size(A) == (num_nodes, num_nodes)
return dir == :out ? A : A'
end

function Graphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::DataType=eltype(g); dir=:out, weighted=true)
function Graphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::DataType=eltype(g); dir=:out, weighted=true)
@assert dir ∈ [:in, :out]
A = g.graph
if !weighted
A = binarize(A)
end
A = T != eltype(A) ? T.(A) : A
A = convert_eltype(T, A)
return dir == :out ? A : A'
end
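
# A short usage sketch of the public API (illustrative 3-node cycle):
#   s, t = [1, 2, 3], [2, 3, 1]
#   g = GNNGraph(s, t)
#   A = adjacency_matrix(g, Float32)     # 3×3 sparse; A[i,j] > 0 iff edge i → j
#   adjacency_matrix(g; dir=:in) == A'   # the :in convention is the transpose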

@@ -177,7 +174,7 @@ function _get_edge_weight(g, edge_weight)
end

"""
degree(g::GNNGraph, T=nothing; dir=:out, edge_weight=true)
degree(g::GNNGraph, [T]; dir=:out, edge_weight=true)

Return a vector containing the degrees of the nodes in `g`.

@@ -234,7 +231,7 @@ function Graphs.degree(g::GNNGraph{<:ADJMAT_T}, T::TT=nothing; dir=:out, edge_weight=true)
if edge_weight === false
A = binarize(A)
end
A = eltype(A) != T ? T.(A) : A
A = convert_eltype(T, A)
return dir == :out ? vec(sum(A, dims=2)) :
dir == :in ? vec(sum(A, dims=1)) :
vec(sum(A, dims=1)) .+ vec(sum(A, dims=2))
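
# Following the docstring above, a hedged sketch (same 3-node cycle `g` as in
# the `adjacency_matrix` example):
#   degree(g)             # out-degrees: [1, 1, 1]
#   degree(g; dir=:in)    # in-degrees
#   degree(g, Float32)    # same as `degree(g)`, with Float32 eltype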
49 changes: 48 additions & 1 deletion src/GNNGraphs/utils.jl
@@ -98,6 +98,7 @@ ones_like(x, sz=size(x)) = ones_like(x, eltype(x), sz)
numnonzeros(a::AbstractSparseMatrix) = nnz(a)
numnonzeros(a::AbstractMatrix) = count(!=(0), a)


# each edge is represented by a number in
# 1:N^2
function edge_encoding(s, t, n; directed=true)
@@ -151,11 +152,56 @@

binarize(x) = map(>(0), x)

@non_differentiable numnonzeros(x...)
@non_differentiable binarize(x...)
@non_differentiable edge_encoding(x...)
@non_differentiable edge_decoding(x...)

convert_eltype(::Nothing, x) = x
convert_eltype(::Type{T}, x::AbstractArray{T}) where T = x
convert_eltype(::Type{T}, x::AbstractArray) where T = T.(x)
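# Dispatch notes: `convert_eltype(nothing, x)` is a no-op, and the
# `AbstractArray{T}` method avoids a copy when the eltype already matches;
# only a genuine mismatch pays for the broadcast `T.(x)`.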

_sparse(x::AbstractMatrix) = sparse(x)
_sparse(x::AbstractVector) = sparse(x)
_sparse(s, t, w, m, n) = sparse(s, t, w, m, n)

using CUDA.CUSPARSE: CuSparseMatrixCSR, AbstractCuSparseMatrix

# This is working around 2 issues:
# https://github.com/JuliaGPU/CUDA.jl/issues/1402
# https://github.com/JuliaGPU/CUDA.jl/issues/1407
function _sparse(s::AnyCuVector, t::AnyCuVector, w::AnyCuVector{T}, m, n) where T
p = sortperm(s) # issue CUDA#1407
s, t, w = s[p], t[p], w[p]
T.(sparse(s, t, Float32.(w), m, n))
end
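
# The `sortperm` hands CUSPARSE row-sorted COO buffers (CUDA#1407), and building
# the matrix in Float32 before converting back sidesteps the constructor's
# limited eltype support (CUDA#1402); both readings follow the issues cited above.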

# TODO https://github.com/JuliaGPU/CUDA.jl/issues/1403
Base.:*(x::AnyCuMatrix, y::AbstractCuSparseMatrix) = (y' * x')' |> CuMatrix
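
# CUSPARSE supports sparse × dense but not dense × sparse; since
# (y' * x')' == x * y, the product is computed in the supported order and the
# result materialized back into a dense CuMatrix.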

# Workaround https://github.com/JuliaGPU/CUDA.jl/issues/1406
Base.sum(x::AbstractCuSparseMatrix; dims=:) = cusparse_sum(x, Val(dims))

cusparse_sum(x, ::Val{:}) = sum(cusparse_sum(x, Val(1)))

function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{1})
m, n = size(x)
v = ones_like(x, (1, m))
return v * x
end

function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{2})
m, n = size(x)
v = ones_like(x, (n, 1))
return x * v
end
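
# The ones-vector products reproduce `sum`'s shapes:
#   sum(x; dims=1)  ->  1×n row of column sums    (ones(1, m) * x)
#   sum(x; dims=2)  ->  m×1 column of row sums    (x * ones(n, 1))
#   sum(x)          ->  scalar total (column sums, then sum)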

# # TODO remove this piracy when this is merged
# # https://github.com/JuliaGPU/CUDA.jl/pull/1401
# function CUDA.cu(x::SparseMatrixCSC)
# # Avoid casting to CuSparseMatrixCSC since it is not well supported
# CuSparseMatrixCSR(x)
# end

####################################
# FROM MLBASE.jl
@@ -214,4 +260,5 @@ function getobs!(buffers::Union{Tuple, NamedTuple},
getobs!(buffer, x, indices)
end
end
#######################################################

15 changes: 0 additions & 15 deletions src/msgpass.jl
@@ -189,11 +189,6 @@ function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e)
return xj * A
end
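
# Fast path: with A the unweighted adjacency matrix built earlier in this
# method, column i of xj * A is sum_j xj[:, j] * A[j, i], i.e. the summed
# features of i's in-neighbors, so copy_xj with (+) collapses into one matmul.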

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
propagate((xi,xj,e) -> copy_xj(xi,xj,e), g, +, xi, xj, e)
end

## E_MUL_XJ

# for weighted convolution
@@ -203,11 +198,6 @@ function propagate(::typeof(e_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e::AbstractVector)
return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::AbstractVector)
propagate((xi,xj,e) -> e_mul_xj(xi,xj,e), g, +, xi, xj, e)
end

## W_MUL_XJ

# for weighted convolution
@@ -216,11 +206,6 @@ function propagate(::typeof(w_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e::Nothing)
return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::Nothing)
propagate((xi,xj,e) -> w_mul_xj(xi,xj,e), g, +, xi, xj, e)
end



