diff --git a/perf/perf.jl b/perf/perf.jl
index dfebc0de2..d9b0f0e87 100644
--- a/perf/perf.jl
+++ b/perf/perf.jl
@@ -1,17 +1,30 @@
 using Flux, GraphNeuralNetworks, Graphs, BenchmarkTools, CUDA
 using DataFrames, Statistics, JLD2, SparseArrays
-CUDA.device!(2)
+using Unitful
+# CUDA.device!(2)
 CUDA.allowscalar(false)
 
-BenchmarkTools.ratio(::Missing, x) = Inf
-BenchmarkTools.ratio(x, ::Missing) = 0.0
-BenchmarkTools.ratio(::Missing, ::Missing) = missing
+function getres(res, str)
+    ismissing(res[str]) && return missing 
+    t = median(res[str]).time
+    if t < 1e3
+        t * u"ns"
+    elseif t < 1e6
+        t / 1e3 * u"μs"
+    elseif t < 1e9
+        t / 1e6 * u"ms"
+    else
+        t / 1e9 * u"s"
+    end
+end
 
 function run_single_benchmark(N, c, D, CONV; gtype=:lg)
-    data = erdos_renyi(N, c / (N-1), seed=17)
     X = randn(Float32, D, N)
-    
+
+    data = erdos_renyi(N, c / (N-1), seed=17)
     g = GNNGraph(data; ndata=X, graph_type=gtype)
+    
+    # g = rand_graph(N, c*N; ndata=X, graph_type=gtype)
     g_gpu = g |> gpu    
     
     m = CONV(D => D)
@@ -58,11 +71,12 @@ function run_benchmarks(;
         c = 6,
         D = 100,
         layers = [GCNConv, GATConv],
-        gtypes = [:coo, :sparse, :dense],
+        gtypes = [:coo],
         )
 
-    df = DataFrame(N=Int[], c=Float64[], layer=String[], gtype=Symbol[], 
-                   time_cpu=Any[], time_gpu=Any[]) |> allowmissing
+    df = DataFrame(N=Int[], c=Int[], layer=String[], gtype=Symbol[], 
+                   time_fwd_cpu=Any[], time_fwd_gpu=Any[],
+                   time_grad_cpu=Any[], time_grad_gpu=Any[])
     
     for gtype in gtypes
         for N in Ns
@@ -73,34 +87,37 @@ function run_benchmarks(;
                         N = N,
                         c = c,
                         gtype = gtype, 
-                        time_cpu = ismissing(res["CPU"]) ? missing : median(res["CPU"]),
-                        time_gpu = ismissing(res["GPU"]) ? missing : median(res["GPU"]),
+                        time_fwd_cpu = getres(res, "CPU_FWD"),
+                        time_fwd_gpu = getres(res, "GPU_FWD"),
+                        time_grad_cpu = getres(res, "CPU_GRAD"),
+                        time_grad_gpu = getres(res, "GPU_GRAD"),
                     )
                 push!(df, row)
+                println(row)
             end
         end
     end
 
-    df.gpu_to_cpu = ratio.(df.time_gpu, df.time_cpu)
+    df.grad_gpu_to_cpu = NoUnits.(df.time_grad_gpu ./ df.time_grad_cpu)
     sort!(df, [:layer, :N, :c, :gtype])
     return df
 end
 
-# df = run_benchmarks()
-# for g in groupby(df, :layer); println(g, "\n"); end
+df = run_benchmarks()
+for g in groupby(df, :layer); println(g, "\n"); end
 
-# @save "perf/perf_master_20210803_carlo.jld2" dfmaster=df
+# @save "master_2021_11_01_arrakis.jld2" dfmaster=df
 ## or
-# @save "perf/perf_pr.jld2" dfpr=df
+# @save "pr.jld2" dfpr=df
 
 
 function compare(dfpr, dfmaster; on=[:N, :c, :gtype, :layer])
     df = outerjoin(dfpr, dfmaster; on=on, makeunique=true, renamecols = :_pr => :_master)
-    df.pr_to_master_cpu = ratio.(df.time_cpu_pr, df.time_cpu_master)
-    df.pr_to_master_gpu = ratio.(df.time_gpu_pr, df.time_gpu_master) 
+    df.pr_to_master_cpu = df.time_cpu_pr ./ df.time_cpu_master
+    df.pr_to_master_gpu = df.time_gpu_pr ./ df.time_gpu_master 
     return df[:,[:N, :c, :gtype, :layer, :pr_to_master_cpu, :pr_to_master_gpu]]
 end
 
 # @load "perf/perf_pr.jld2" dfpr
 # @load "perf/perf_master.jld2" dfmaster
-# compare(dfpr, dfmaster)
+# compare(dfpr, dfmaster)
\ No newline at end of file
diff --git a/src/GNNGraphs/convert.jl b/src/GNNGraphs/convert.jl
index 8da7345a4..e2c7dea43 100644
--- a/src/GNNGraphs/convert.jl
+++ b/src/GNNGraphs/convert.jl
@@ -81,20 +81,16 @@ to_dense(A::AbstractSparseMatrix, x...; kws...) = to_dense(collect(A), x...; kws
 
 function to_dense(A::ADJMAT_T, T=nothing; dir=:out, num_nodes=nothing, weighted=true)
     @assert dir ∈ [:out, :in]
-    T = T === nothing ? eltype(A) : T
     num_nodes = size(A, 1)
     @assert num_nodes == size(A, 2)
-    # @assert all(x -> (x == 1) || (x == 0), A)
     num_edges = numnonzeros(A)
     if dir == :in
         A = A'
     end
-    if T != eltype(A)
-        A = T.(A)
-    end
     if !weighted
-        A = map(x -> ifelse(x > 0, T(1), T(0)), A)
+        A = binarize(A)
     end
+    A = convert_eltype(T, A)
     return A, num_nodes, num_edges
 end
 
@@ -128,10 +124,7 @@ function to_dense(coo::COO_T, T=nothing; dir=:out, num_nodes=nothing, weighted=t
     if val === nothing || !weighted  
         val = ones_like(s, T)            
     end
-    if eltype(val) != T
-        val = T.(val)
-    end
-    
+    val = convert_eltype(T, val)
     idxs = s .+ n .* (t .- 1) 
     
     ## using scatter instead of indexing since there could be multiple edges
@@ -149,20 +142,17 @@ function to_sparse(A::ADJMAT_T, T=nothing; dir=:out, num_nodes=nothing, weighted
     @assert dir ∈ [:out, :in]
     num_nodes = size(A, 1)
     @assert num_nodes == size(A, 2)
-    T = T === nothing ? eltype(A) : T
-    num_edges = A isa AbstractSparseMatrix ? nnz(A) : count(!=(0), A)
     if dir == :in
         A = A'
     end
-    if T != eltype(A)
-        A = T.(A)
-    end
     if !(A isa AbstractSparseMatrix)
-        A = sparse(A)
+        A = _sparse(A)
     end
     if !weighted
-        A = map(x -> ifelse(x > 0, T(1), T(0)), A)
+        A = binarize(A)
     end
+    A = convert_eltype(T, A)
+    num_edges = nnz(A)
     return A, num_nodes, num_edges
 end
 
@@ -180,10 +170,8 @@ function to_sparse(coo::COO_T, T=nothing; dir=:out, num_nodes=nothing, weighted=
     end
 
     num_nodes::Int = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes 
-    A = sparse(s, t, eweight, num_nodes, num_nodes)
+    A = _sparse(s, t, eweight, num_nodes, num_nodes)
     num_edges::Int = nnz(A)
-    if eltype(A) != T
-        A = T.(A)
-    end
+    A = convert_eltype(T, A)
     return A, num_nodes, num_edges
 end
diff --git a/src/GNNGraphs/query.jl b/src/GNNGraphs/query.jl
index f4efcf842..df39897ef 100644
--- a/src/GNNGraphs/query.jl
+++ b/src/GNNGraphs/query.jl
@@ -131,35 +131,32 @@ adjacency_list(g::GNNGraph; dir=:out) = adjacency_list(g, 1:g.num_nodes; dir)
 
 
 """
-    adjacency_matrix(g::GNNGraph, T=eltype(g); dir=:out, weighted=true)
+    adjacency_matrix(g::GNNGraph, [T]; dir=:out, weighted=true)
 
 Return the adjacency matrix `A` for the graph `g`. 
 
 If `dir=:out`, `A[i,j] > 0` denotes the presence of an edge from node `i` to node `j`.
 If `dir=:in` instead, `A[i,j] > 0` denotes the presence of an edge from node `j` to node `i`.
 
-User may specify the eltype `T` of the returned matrix. 
+The user can specify the eltype `T` of the returned matrix. 
 
 If `weighted=true`, the `A` will contain the edge weigths if any, otherwise the elements of `A` will be either 0 or 1.
 """
-function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=eltype(g); dir=:out, weighted=true)
-    if g.graph[1] isa CuVector
-        # TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
-        A, n, m = to_dense(g.graph, T; num_nodes=g.num_nodes, weighted)
-    else
-        A, n, m = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
-    end
-    @assert size(A) == (n, n)
+function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::TT=eltype(g); dir=:out, weighted=true) where 
+        {TT <: Union{DataType}}
+    A, num_nodes, num_edges = to_sparse(g.graph, T; num_nodes=g.num_nodes, weighted)
+    @assert size(A) == (num_nodes, num_nodes)
     return dir == :out ? A : A'
 end
 
-function Graphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::DataType=eltype(g); dir=:out, weighted=true)
+function Graphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::TT=eltype(g); dir=:out, weighted=true) where
+        {TT <: Union{DataType}}
     @assert dir ∈ [:in, :out]
     A = g.graph
     if !weighted
         A = binarize(A)
     end
-    A = T != eltype(A) ? T.(A) : A
+    A = convert_eltype(T, A)
     return dir == :out ? A : A'
 end
 
@@ -177,7 +174,7 @@ function _get_edge_weight(g, edge_weight)
 end
 
 """
-    degree(g::GNNGraph, T=nothing; dir=:out, edge_weight=true)
+    degree(g::GNNGraph, [T]; dir=:out, edge_weight=true)
 
 Return a vector containing the degrees of the nodes in `g`.
 
@@ -234,7 +231,7 @@ function Graphs.degree(g::GNNGraph{<:ADJMAT_T}, T::TT=nothing; dir=:out, edge_we
     if edge_weight === false
         A = binarize(A)
     end
-    A = eltype(A) != T ? T.(A) : A
+    A = convert_eltype(T, A)
     return dir == :out ? vec(sum(A, dims=2)) : 
            dir == :in  ? vec(sum(A, dims=1)) :
                   vec(sum(A, dims=1)) .+ vec(sum(A, dims=2)) 
diff --git a/src/GNNGraphs/utils.jl b/src/GNNGraphs/utils.jl
index 738291342..3339753d1 100644
--- a/src/GNNGraphs/utils.jl
+++ b/src/GNNGraphs/utils.jl
@@ -98,6 +98,7 @@ ones_like(x, sz=size(x)) = ones_like(x, eltype(x), sz)
 numnonzeros(a::AbstractSparseMatrix) = nnz(a)
 numnonzeros(a::AbstractMatrix) = count(!=(0), a)
 
+
 # each edge is represented by a number in
 # 1:N^2
 function edge_encoding(s, t, n; directed=true)
@@ -151,11 +152,56 @@ end
 
 binarize(x) = map(>(0), x)
 
+@non_differentiable numnonzeros(x...)
 @non_differentiable binarize(x...)
 @non_differentiable edge_encoding(x...)
 @non_differentiable edge_decoding(x...)
 
+convert_eltype(::Nothing, x) = x
+convert_eltype(::Type{T}, x::AbstractArray{T}) where T = x
+convert_eltype(::Type{T}, x::AbstractArray) where T = T.(x)
+
+_sparse(x::AbstractMatrix) = sparse(x)
+_sparse(x::AbstractVector) = sparse(x)
+_sparse(s, t, w, m, n) = sparse(s, t, w, m, n)
+
+using CUDA.CUSPARSE: CuSparseMatrixCSR, AbstractCuSparseMatrix
+    
+# This is working around 2 issues:
+# https://github.com/JuliaGPU/CUDA.jl/issues/1402
+# https://github.com/JuliaGPU/CUDA.jl/issues/1407
+function _sparse(s::AnyCuVector, t::AnyCuVector, w::AnyCuVector{T}, m, n) where T
+    p = sortperm(s) # issue CUDA#1407
+    s, t, w = s[p], t[p], w[p]
+    T.(sparse(s, t, Float32.(w), m, n))
+end
+
+# TODO https://github.com/JuliaGPU/CUDA.jl/issues/1403
+Base.:*(x::AnyCuMatrix, y::AbstractCuSparseMatrix) = (y' * x')' |> CuMatrix
+
+# Workaround https://github.com/JuliaGPU/CUDA.jl/issues/1406
+Base.sum(x::AbstractCuSparseMatrix; dims=:) = cusparse_sum(x, Val(dims))
 
+cusparse_sum(x, ::Val{:}) = sum(cusparse_sum(x, Val(1)))
+
+function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{1})
+    m, n = size(x)
+    v = ones_like(x, (1, m))
+    return v * x
+end
+
+function cusparse_sum(x::AbstractCuSparseMatrix, ::Val{2})
+    m, n = size(x)
+    v = ones_like(x, (n, 1))
+    return x * v
+end
+
+# # TODO remove this piracy when this is merged
+# # https://github.com/JuliaGPU/CUDA.jl/pull/1401
+# function CUDA.cu(x::SparseMatrixCSC)
+#     # Avoid casting to CuSparseMatrixCSC since it is not well supported
+#     CuSparseMatrixCSR(x)
+# end
 
 ####################################
 # FROM MLBASE.jl
@@ -214,4 +260,5 @@ function getobs!(buffers::Union{Tuple, NamedTuple},
                 getobs!(buffer, x, indices)
             end
 end
-#######################################################
\ No newline at end of file
+#######################################################
+
diff --git a/src/msgpass.jl b/src/msgpass.jl
index cc840c947..fc679f2ff 100644
--- a/src/msgpass.jl
+++ b/src/msgpass.jl
@@ -189,11 +189,6 @@ function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstract
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
-    propagate((xi,xj,e) -> copy_xj(xi,xj,e), g, +, xi, xj, e)
-end
-
 ## E_MUL_XJ 
 
 # for weighted convolution
@@ -203,11 +198,6 @@ function propagate(::typeof(e_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::AbstractVector)
-    propagate((xi,xj,e) -> e_mul_xj(xi,xj,e), g, +, xi, xj, e)
-end
-
 ## W_MUL_XJ 
 
 # for weighted convolution
@@ -216,11 +206,6 @@ function propagate(::typeof(w_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
     return xj * A
 end
 
-## avoid the fast path on gpu until we have better cuda support
-function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e::Nothing)
-    propagate((xi,xj,e) -> w_mul_xj(xi,xj,e), g, +, xi, xj, e)
-end
-
 
 
 
diff --git a/test.jl b/test.jl
new file mode 100644
index 000000000..3b7b997e8
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,69 @@
+using GraphNeuralNetworks, Random, Flux, Test, CUDA, SparseArrays, CUDA.CUSPARSE
+
+Random.seed!(17)
+g = rand_graph(6, 14)
+@test !has_self_loops(g)
+x = rand(2, g.num_nodes)
+l = GCNConv(2 => 2)
+y = l(g, x)
+s, t = edge_index(g)
+A = adjacency_matrix(g)
+
+g_gpu = g |> gpu
+x_gpu = x |> gpu
+l_gpu = l |> gpu
+s_gpu, t_gpu = edge_index(g_gpu)
+y_gpu = l_gpu(g_gpu, x_gpu)
+A_gpu = adjacency_matrix(g_gpu)
+
+@test Array(s_gpu) ≈ s
+@test Array(t_gpu) ≈ t
+
+@test Array(A_gpu) ≈ Array(A)
+@test Array(degree(g_gpu)) ≈ Array(degree(g))
+
+@test Array(y_gpu) ≈ y
+
+
+
+# @testset "Conv Layers" begin
+#     in_channel = 3
+#     out_channel = 5
+#     N = 4
+#     T = Float32
+
+#     adj1 =  [0 1 0 1
+#              1 0 1 0
+#              0 1 0 1
+#              1 0 1 0]
+    
+#     g1 = GNNGraph(adj1, 
+#             ndata=rand(T, in_channel, N), 
+#             graph_type=GRAPH_T)
+        
+#     adj_single_vertex =  [0 0 0 1
+#                           0 0 0 0
+#                           0 0 0 1
+#                           1 0 1 0]
+    
+#     g_single_vertex = GNNGraph(adj_single_vertex, 
+#                                 ndata=rand(T, in_channel, N), 
+#                                 graph_type=GRAPH_T)    
+
+#     test_graphs = [g1, g_single_vertex]
+
+#     @testset "GCNConv" begin
+#         l = GCNConv(in_channel => out_channel)
+#         for g in test_graphs
+#             test_layer(l, g, rtol=1e-5, outsize=(out_channel, g.num_nodes))
+#         end
+
+#         l = GCNConv(in_channel => out_channel, tanh, bias=false)
+#         for g in test_graphs
+#             test_layer(l, g, rtol=1e-5, outsize=(out_channel, g.num_nodes))
+#         end
+
+#         l = GCNConv(in_channel => out_channel, add_self_loops=false)
+#         test_layer(l, g1, rtol=1e-5, outsize=(out_channel, g1.num_nodes))
+#     end
+# end
\ No newline at end of file
diff --git a/test/GNNGraphs/gnngraph.jl b/test/GNNGraphs/gnngraph.jl
index 08efe3ca5..d87b0cf33 100644
--- a/test/GNNGraphs/gnngraph.jl
+++ b/test/GNNGraphs/gnngraph.jl
@@ -81,21 +81,25 @@
             @test adjacency_matrix(g; dir=:out) == adj_mat
             
             if TEST_GPU
-                # See https://github.com/JuliaGPU/CUDA.jl/pull/1093
                 mat_gpu = adjacency_matrix(g_gpu)
-                @test mat_gpu isa ACUMatrix{Int}
+                if GRAPH_T == :dense
+                    @test mat_gpu isa CuMatrix{Int}
+                else
+                    @test mat_gpu isa CuSparseMatrix
+                    # @test_broken mat_gpu isa CuSparseMatrix{Int}
+                end
                 @test Array(mat_gpu) == adj_mat 
             end
         end
         
-        @testset "normalized_laplacian" begin
-            mat = normalized_laplacian(g)
-            if TEST_GPU
-                mat_gpu = normalized_laplacian(g_gpu)
-                @test mat_gpu isa ACUMatrix{Float32}
-                @test Array(mat_gpu) == mat 
-            end
-        end
+        # @testset "normalized_laplacian" begin
+        #     mat = normalized_laplacian(g)
+        #     if TEST_GPU
+        #         mat_gpu = normalized_laplacian(g_gpu)
+        #         @test mat_gpu isa ACUMatrix{Float32}
+        #         @test Array(mat_gpu) == mat 
+        #     end
+        # end
 
 
         @testset "scaled_laplacian" begin
diff --git a/test/GNNGraphs/query.jl b/test/GNNGraphs/query.jl
index 4d4c88a14..118e11248 100644
--- a/test/GNNGraphs/query.jl
+++ b/test/GNNGraphs/query.jl
@@ -124,7 +124,11 @@
         A = adjacency_matrix(g, Float32)
         @test A ≈ a
         @test eltype(A) == Float32
-    
+        if GRAPH_T == :dense
+            A isa AbstractSparseMatrix{Float32}
+        else
+            A isa Matrix{Float32}
+        end    
         Abin = adjacency_matrix(g, Float32, weighted=false)
         @test Abin ≈ abin
         @test eltype(Abin) == Float32    
@@ -148,5 +152,20 @@
 
             @test gw == [1,1,1]
         end
+
+        if TEST_GPU 
+            g = rand_graph(10, 30, graph_type=GRAPH_T)
+            A = adjacency_matrix(g)
+            
+            g_gpu = g |> gpu
+            A_gpu = adjacency_matrix(g_gpu)
+            
+            if GRAPH_T == :dense
+                @test A_gpu isa CuMatrix
+            else
+                @test A_gpu isa CuSparseMatrix
+            end
+            @test Array(A_gpu) == Array(A)
+        end
     end
 end
diff --git a/test/layers/conv.jl b/test/layers/conv.jl
index 9aa6462b8..4494392cd 100644
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@@ -104,7 +104,7 @@
         for heads in (1, 2), concat in (true, false)
             l = GATConv(in_channel => out_channel; heads, concat)
             for g in test_graphs
-                test_layer(l, g, rtol=1e-3,
+                test_layer(l, g, rtol=1e-2,
                     outsize=(concat ? heads*out_channel : out_channel, g.num_nodes))
             end
         end
@@ -113,7 +113,7 @@
             ein = 3
             l = GATConv((in_channel, ein) => out_channel, add_self_loops=false)
             g = GNNGraph(g1, edata=rand(T, ein, g1.num_edges))
-            test_layer(l, g, rtol=1e-3, outsize=(out_channel, g.num_nodes))
+            test_layer(l, g, rtol=1e-2, outsize=(out_channel, g.num_nodes))
         end
 
         @testset "num params" begin
@@ -131,7 +131,7 @@
         for heads in (1, 2), concat in (true, false)
             l = GATv2Conv(in_channel => out_channel, tanh; heads, concat)
             for g in test_graphs
-                test_layer(l, g, rtol=1e-3,
+                test_layer(l, g, rtol=1e-2,
                     outsize=(concat ? heads*out_channel : out_channel, g.num_nodes))
             end
         end
@@ -140,7 +140,7 @@
             ein = 3
             l = GATv2Conv((in_channel, ein) => out_channel, add_self_loops=false)
             g = GNNGraph(g1, edata=rand(T, ein, g1.num_edges))
-            test_layer(l, g, rtol=1e-3, outsize=(out_channel, g.num_nodes))
+            test_layer(l, g, rtol=1e-2, outsize=(out_channel, g.num_nodes))
         end
 
         @testset "num params" begin
@@ -156,7 +156,7 @@
             ein = 3
             l = GATv2Conv((in_channel, ein) => out_channel, add_self_loops=false)
             g = GNNGraph(g1, edata=rand(T, ein, g1.num_edges))
-            test_layer(l, g, rtol=1e-3, outsize=(out_channel, g.num_nodes))
+            test_layer(l, g, rtol=1e-2, outsize=(out_channel, g.num_nodes))
         end 
     end
 
@@ -246,7 +246,7 @@
         l = MEGNetConv(in_channel => out_channel, aggr=+)
         for g in test_graphs
             g = GNNGraph(g, edata=rand(T, in_channel, g.num_edges))
-            test_layer(l, g, rtol=1e-3,
+            test_layer(l, g, rtol=1e-2,
                 outtype=:node_edge, 
                 outsize=((out_channel, g.num_nodes), (out_channel, g.num_edges))) 
         end
diff --git a/test/runtests.jl b/test/runtests.jl
index 41c60fee8..c4bae3fca 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,6 +2,7 @@ using GraphNeuralNetworks
 using GraphNeuralNetworks.GNNGraphs: sort_edge_index
 using Flux
 using CUDA
+using CUDA.CUSPARSE
 using Flux: gpu, @functor
 using LinearAlgebra, Statistics, Random
 using NNlib
@@ -16,7 +17,7 @@ using InlineStrings  # not used but with the import we test #98 and #104
 
 CUDA.allowscalar(false)
 
-const ACUMatrix{T} = Union{CuMatrix{T}, CUDA.CUSPARSE.CuSparseMatrix{T}}
+const ACUMatrix{T} = Union{CuMatrix{T}, CuSparseMatrix{T}}
 
 ENV["DATADEPS_ALWAYS_ACCEPT"] = true # for MLDatasets
 
@@ -32,18 +33,19 @@ tests = [
     "GNNGraphs/sampling",
     "utils",
     "msgpass",
-    "layers/basic",
-    "layers/conv",
-    "layers/pool",
-    "examples/node_classification_cora",
-    "deprecations",
+    # "layers/basic",
+    # "layers/conv",
+    # "layers/pool",
+    # "examples/node_classification_cora",
+    # "deprecations",
 ]
 
 !CUDA.functional() && @warn("CUDA unavailable, not testing GPU support")
 
 @testset "GraphNeuralNetworks: graph format $graph_type" for graph_type in (:coo, :dense, :sparse) 
     global GRAPH_T = graph_type
-    global TEST_GPU = CUDA.functional() && (GRAPH_T != :sparse)
+    # global TEST_GPU = CUDA.functional() && (GRAPH_T != :sparse)
+    global TEST_GPU = true
 
     for t in tests
         startswith(t, "examples") && GRAPH_T == :dense && continue     # not testing :dense since causes OutOfMememory on github's CI