diff --git a/perf/Project.toml b/perf/Project.toml
index ddbb1be6e..825ed5a2b 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -1,6 +1,8 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
+Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-Graphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
+Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
diff --git a/perf/master_2021_11_01_arrakis.jld2 b/perf/master_2021_11_01_arrakis.jld2
new file mode 100644
index 000000000..6d058a249
Binary files /dev/null and b/perf/master_2021_11_01_arrakis.jld2 differ
diff --git a/perf/perf.jl b/perf/perf.jl
index dfebc0de2..04dec42fe 100644
--- a/perf/perf.jl
+++ b/perf/perf.jl
@@ -1,17 +1,30 @@
 using Flux, GraphNeuralNetworks, Graphs, BenchmarkTools, CUDA
 using DataFrames, Statistics, JLD2, SparseArrays
-CUDA.device!(2)
+using Unitful
+# CUDA.device!(2)
 CUDA.allowscalar(false)
 
-BenchmarkTools.ratio(::Missing, x) = Inf
-BenchmarkTools.ratio(x, ::Missing) = 0.0
-BenchmarkTools.ratio(::Missing, ::Missing) = missing
+function getres(res, str)
+    ismissing(res[str]) && return missing
+    t = median(res[str]).time
+    if t < 1e3
+        t * u"ns"
+    elseif t < 1e6
+        t / 1e3 * u"μs"
+    elseif t < 1e9
+        t / 1e6 * u"ms"
+    else
+        t / 1e9 * u"s"
+    end
+end
 
 function run_single_benchmark(N, c, D, CONV; gtype=:lg)
-    data = erdos_renyi(N, c / (N-1), seed=17)
     X = randn(Float32, D, N)
-    
+
+    data = erdos_renyi(N, c / (N-1), seed=17)
     g = GNNGraph(data; ndata=X, graph_type=gtype)
+
+    # g = rand_graph(N, c*N; ndata=X, graph_type=gtype)
     g_gpu = g |> gpu
 
     m = CONV(D => D)
@@ -58,11 +71,12 @@ function run_benchmarks(;
         c = 6,
         D = 100,
         layers = [GCNConv, GATConv],
-        gtypes = [:coo, :sparse, :dense],
+        gtypes = [:coo],
     )
 
-    df = DataFrame(N=Int[], c=Float64[], layer=String[], gtype=Symbol[],
-                   time_cpu=Any[], time_gpu=Any[]) |> allowmissing
+    df = DataFrame(N=Int[], c=Int[], layer=String[], gtype=Symbol[],
+                   time_fwd_cpu=Any[], time_fwd_gpu=Any[],
+                   time_grad_cpu=Any[], time_grad_gpu=Any[])
 
     for gtype in gtypes
         for N in Ns
@@ -73,31 +87,34 @@ function run_benchmarks(;
                     N = N,
                     c = c,
                     gtype = gtype,
-                    time_cpu = ismissing(res["CPU"]) ? missing : median(res["CPU"]),
-                    time_gpu = ismissing(res["GPU"]) ? missing : median(res["GPU"]),
+                    time_fwd_cpu = getres(res, "CPU_FWD"),
+                    time_fwd_gpu = getres(res, "GPU_FWD"),
+                    time_grad_cpu = getres(res, "CPU_GRAD"),
+                    time_grad_gpu = getres(res, "GPU_GRAD"),
                 )
                 push!(df, row)
+                println(row)
             end
         end
     end
 
-    df.gpu_to_cpu = ratio.(df.time_gpu, df.time_cpu)
+    df.grad_gpu_to_cpu = NoUnits.(df.time_grad_gpu ./ df.time_grad_cpu)
     sort!(df, [:layer, :N, :c, :gtype])
     return df
 end
 
-# df = run_benchmarks()
-# for g in groupby(df, :layer); println(g, "\n"); end
+df = run_benchmarks()
+for g in groupby(df, :layer); println(g, "\n"); end
 
-# @save "perf/perf_master_20210803_carlo.jld2" dfmaster=df
+# @save "master_2021_11_01_arrakis.jld2" dfmaster=df
 ## or
-# @save "perf/perf_pr.jld2" dfpr=df
+# @save "pr.jld2" dfpr=df
 
 
 function compare(dfpr, dfmaster; on=[:N, :c, :gtype, :layer])
     df = outerjoin(dfpr, dfmaster; on=on, makeunique=true, renamecols = :_pr => :_master)
-    df.pr_to_master_cpu = ratio.(df.time_cpu_pr, df.time_cpu_master)
-    df.pr_to_master_gpu = ratio.(df.time_gpu_pr, df.time_gpu_master)
+    df.pr_to_master_cpu = df.time_cpu_pr ./ df.time_cpu_master
+    df.pr_to_master_gpu = df.time_gpu_pr ./ df.time_gpu_master
     return df[:,[:N, :c, :gtype, :layer, :pr_to_master_cpu, :pr_to_master_gpu]]
 end
diff --git a/perf/pr_2021_11_01_arrakis.jld2 b/perf/pr_2021_11_01_arrakis.jld2
new file mode 100644
index 000000000..c55c19974
Binary files /dev/null and b/perf/pr_2021_11_01_arrakis.jld2 differ
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
index 51e8891c6..7bffa6686 100644
--- a/src/GNNGraphs/GNNGraphs.jl
+++ b/src/GNNGraphs/GNNGraphs.jl
@@ -3,6 +3,7 @@ module GNNGraphs
 using SparseArrays
 using Functors: @functor
 using CUDA
+using CUDA.CUSPARSE
 import Graphs
 using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
 import Flux
diff --git a/src/GNNGraphs/convert.jl b/src/GNNGraphs/convert.jl
index 09f0de586..2036432d1 100644
--- a/src/GNNGraphs/convert.jl
+++ b/src/GNNGraphs/convert.jl
@@ -137,11 +137,39 @@
 function to_sparse(coo::COO_T, T::DataType=Int; dir=:out, num_nodes=nothing)
     s, t, eweight = coo
     eweight = isnothing(eweight) ? fill!(similar(s, T), 1) : eweight
     num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
-    A = sparse(s, t, eweight, num_nodes, num_nodes)
+    A = _sparse(s, t, eweight, num_nodes, num_nodes)
     num_edges = length(s)
     return A, num_nodes, num_edges
 end
 
+_sparse(s, t, eweight, n, m) = sparse(s, t, eweight, n, m)
+
+function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n)
+    spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
+    return CuSparseMatrixCSR(spcoo)
+end
+
+# function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n; fmt=:csr)
+#     # Tv = Int32
+#     spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
+#     if fmt == :csc
+#         return CuSparseMatrixCSC(spcoo)
+#     elseif fmt == :csr
+#         return CuSparseMatrixCSR(spcoo)
+#     elseif fmt == :coo
+#         return spcoo
+#     else
+#         error("Format :$fmt not available, use :csc, :csr, or :coo.")
+#     end
+# end
+
+
+# Workaround for https://github.com/JuliaGPU/CUDA.jl/issues/1113#issuecomment-955759875
+function Base.:*(A::CuMatrix, B::CuSparseMatrixCSR)
+    @assert size(A, 2) == size(B, 1)
+    return CuMatrix((B' * A')')
+end
+
 @non_differentiable to_coo(x...)
 @non_differentiable to_dense(x...)
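
A minimal sketch of the sparse conversion path introduced in `src/GNNGraphs/convert.jl` above, assuming a CUDA-capable device: COO indices are wrapped in a `CuSparseMatrixCOO`, converted to CSR, and a dense × sparse product is computed through the same double-adjoint trick used by the `Base.:*` workaround (CUSPARSE ships sparse × dense kernels, not dense × sparse). Sizes and values here are illustrative only.

```julia
using CUDA, CUDA.CUSPARSE

# COO edge list of a tiny 3-node graph (illustrative values)
s = CuArray(Int32[1, 2, 3, 3])      # source node indices
t = CuArray(Int32[2, 3, 1, 2])      # target node indices
w = CuArray(Float32[1, 1, 1, 1])    # edge weights
n = 3

# Same construction as the `_sparse` method for CuVector inputs
Acoo = CuSparseMatrixCOO{Float32, Int32}(s, t, w, (n, n))
Acsr = CuSparseMatrixCSR(Acoo)

# Dense node features, one column per node
X = CUDA.randn(Float32, 8, n)

# X * Acsr computed as in the Base.:* workaround: (Acsr' * X')'
Y = CuMatrix((Acsr' * X')')
```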
diff --git a/src/GNNGraphs/generate.jl b/src/GNNGraphs/generate.jl
index 67f5d1f1b..9f0eeb5d2 100644
--- a/src/GNNGraphs/generate.jl
+++ b/src/GNNGraphs/generate.jl
@@ -1,5 +1,5 @@
 """
-    rand_graph(n, m; bidirected=true, kws...)
+    rand_graph(n, m; bidirected=true, seed=-1, kws...)
 
 Generate a random (Erdós-Renyi) `GNNGraph` with `n` nodes and `m` edges.
 
@@ -43,10 +43,10 @@
 julia> edge_index(g)
 ```
 """
-function rand_graph(n::Integer, m::Integer; bidirected=true, kws...)
+function rand_graph(n::Integer, m::Integer; bidirected=true, seed=-1, kws...)
     if bidirected
         @assert iseven(m) "Need even number of edges for bidirected graphs, given m=$m."
     end
     m2 = bidirected ? m÷2 : m
-    return GNNGraph(Graphs.erdos_renyi(n, m2, is_directed=!bidirected); kws...)
+    return GNNGraph(Graphs.erdos_renyi(n, m2; is_directed=!bidirected, seed); kws...)
 end
diff --git a/src/GNNGraphs/gnngraph.jl b/src/GNNGraphs/gnngraph.jl
index a0bf2440f..019586f05 100644
--- a/src/GNNGraphs/gnngraph.jl
+++ b/src/GNNGraphs/gnngraph.jl
@@ -192,8 +192,8 @@ function GNNGraph(g::GNNGraph; ndata=g.ndata, edata=g.edata, gdata=g.gdata, grap
             ndata, edata, gdata)
 end
 
-function Base.show(io::IO, g::GNNGraph)
-    println(io, "GNNGraph:
+function Base.show(io::IO, g::GNNGraph{T}) where T
+    println(io, "GNNGraph{$T}:
                 num_nodes = $(g.num_nodes)
                 num_edges = $(g.num_edges)
                 num_graphs = $(g.num_graphs)")
diff --git a/src/GNNGraphs/query.jl b/src/GNNGraphs/query.jl
index bb4fb6f29..5e580f284 100644
--- a/src/GNNGraphs/query.jl
+++ b/src/GNNGraphs/query.jl
@@ -74,12 +74,7 @@ function adjacency_list(g::GNNGraph; dir=:out)
 end
 
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=Int; dir=:out)
-    if g.graph[1] isa CuVector
-        # TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
-        A, n, m = to_dense(g.graph, T, num_nodes=g.num_nodes)
-    else
-        A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
-    end
+    A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
     @assert size(A) == (n, n)
     return dir == :out ? A : A'
 end
diff --git a/src/msgpass.jl b/src/msgpass.jl
index 1611ebe56..93411261f 100644
--- a/src/msgpass.jl
+++ b/src/msgpass.jl
@@ -152,11 +152,32 @@ function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(+), xi, xj::AbstractM
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
-    propagate((xi,xj,e)->copyxj(xi,xj,e), g, +, xi, xj, e)
+# Have to define custom rule since CUDA.jl has troubles with some sparse-dense multiplications
+function ChainRulesCore.rrule(::typeof(propagate), ::typeof(copyxj), g::GNNGraph,
+                              ::typeof(+), xi, xj::AbstractMatrix, e)
+    A = adjacency_matrix(g)
+    y = xj * A
+    function propagate_pullback(ȳ)
+        Ȳ = unthunk(ȳ)
+        dxj = Ȳ * A'
+        return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
+    end
+
+    function propagate_pullback(ȳ::CuMatrix)
+        Ȳ = unthunk(ȳ)
+        dxj = CuArray((A * Ȳ')')
+        return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
+    end
+
+    y, propagate_pullback
 end
+
+# ## avoid the fast path on gpu until we have better cuda support
+# function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
+#     propagate((xi,xj,e) -> copyxj(xi,xj,e), g, +, xi, xj, e)
+# end
+
 # function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
 #     A = adjacency_matrix(g)
 #     D = compute_degree(A)
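
The custom `rrule` above only propagates the cotangent of `xj`: since `y = xj * A`, the pullback is `dxj = Ȳ * A'`. A small CPU-only sanity check of that identity against Zygote (Zygote is assumed to be available, as Flux already depends on it):

```julia
using Zygote

xj = randn(Float32, 4, 5)   # features
A  = randn(Float32, 5, 5)   # stands in for the adjacency matrix
Ȳ  = randn(Float32, 4, 5)   # incoming cotangent

y, back = Zygote.pullback((x, a) -> x * a, xj, A)
dxj_ad, _ = back(Ȳ)

@assert dxj_ad ≈ Ȳ * A'     # matches the rule used in propagate_pullback
```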
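
A quick sketch of the new `seed` keyword on `rand_graph`: a fixed seed makes the generated edge set reproducible, while the default `seed=-1` keeps the previous non-deterministic behaviour. This assumes the seeded `Graphs.erdos_renyi` call is deterministic for a given seed.

```julia
using GraphNeuralNetworks

g1 = rand_graph(10, 20; seed=17)
g2 = rand_graph(10, 20; seed=17)

@assert edge_index(g1) == edge_index(g2)   # same seed, same edges
```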