From 87f3862db8bfcde9a7921f456007368bf789cd81 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:59:48 -0300 Subject: [PATCH 1/8] Copy-paste CUDA benchmarks --- perf/.gitignore | 2 + perf/Project.toml | 6 + perf/array.jl | 110 +++++++++++++++++ perf/byval.jl | 79 ++++++++++++ perf/kernel.jl | 33 +++++ perf/latency.jl | 39 ++++++ perf/metal.jl | 14 +++ perf/metaldevrt.jl | 42 +++++++ perf/runbenchmarks.jl | 128 ++++++++++++++++++++ perf/volumerhs.jl | 273 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 726 insertions(+) create mode 100644 perf/.gitignore create mode 100644 perf/Project.toml create mode 100644 perf/array.jl create mode 100644 perf/byval.jl create mode 100644 perf/kernel.jl create mode 100644 perf/latency.jl create mode 100644 perf/metal.jl create mode 100644 perf/metaldevrt.jl create mode 100644 perf/runbenchmarks.jl create mode 100644 perf/volumerhs.jl diff --git a/perf/.gitignore b/perf/.gitignore new file mode 100644 index 000000000..124aa781c --- /dev/null +++ b/perf/.gitignore @@ -0,0 +1,2 @@ +results.json +reference.json diff --git a/perf/Project.toml b/perf/Project.toml new file mode 100644 index 000000000..8314c7b28 --- /dev/null +++ b/perf/Project.toml @@ -0,0 +1,6 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/array.jl b/perf/array.jl new file mode 100644 index 000000000..a6a226ccf --- /dev/null +++ b/perf/array.jl @@ -0,0 +1,110 @@ +group = addgroup!(SUITE, "array") + +const m = 512 +const n = 1000 + +# generate some arrays +cpu_mat = rand(rng, Float32, m, n) +gpu_mat = CuArray{Float32}(undef, size(cpu_mat)) +gpu_vec = reshape(gpu_mat, length(gpu_mat)) +gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) +gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) +gpu_mat_ints = CuArray(rand(rng, Int, m, n)) +gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) +gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) +gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) + +group["construct"] = @benchmarkable CuArray{Int}(undef, 1) + +group["copy"] = @async_benchmarkable copy($gpu_mat) + +gpu_mat2 = copy(gpu_mat) +let group = addgroup!(group, "copyto!") + group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) + group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) + group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) +end + +let group = addgroup!(group, "iteration") + group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] + + group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] + + let group = addgroup!(group, "findall") + group["bool"] = @benchmarkable findall($gpu_vec_bools) + group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findfirst") + group["bool"] = @benchmarkable findfirst($gpu_vec_bools) + group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findmin") # findmax + group["1d"] = @async_benchmarkable findmin($gpu_vec) + group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) + end +end + +let group = addgroup!(group, "reverse") + group["1d"] = @async_benchmarkable reverse($gpu_vec) + group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) + group["1d_inplace"] = 
@async_benchmarkable reverse!($gpu_vec) + group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) +end + +group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + +# no need to test inplace version, which performs the same operation (but with an alloc) +let group = addgroup!(group, "accumulate") + group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) + group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) +end + +let group = addgroup!(group, "reductions") + let group = addgroup!(group, "reduce") + group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) + group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) + end + + let group = addgroup!(group, "mapreduce") + group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) + group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) + end + + # used by sum, prod, minimum, maximum, all, any, count +end + +let group = addgroup!(group, "random") + let group = addgroup!(group, "rand") + group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) + group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) + end + + let group = addgroup!(group, "rand!") + group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) + group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) + end + + let group = addgroup!(group, "randn") + group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) + #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n) + end + + let group = addgroup!(group, "randn!") + group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) + #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints) + end +end + +let group = addgroup!(group, "sorting") + group["1d"] = @async_benchmarkable sort($gpu_vec) + group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) + group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +end + +let group = addgroup!(group, "permutedims") + group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) + group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) + group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) +end diff --git a/perf/byval.jl b/perf/byval.jl new file mode 100644 index 000000000..d32d62a9c --- /dev/null +++ b/perf/byval.jl @@ -0,0 +1,79 @@ +module ByVal + +using CUDA, BenchmarkTools, Random +using CUDA: i32 + +const threads = 256 + +# simple add matrixes kernel +function kernel_add_mat(n, x1, x2, y) + i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +@inline get_inputs3(indx_y, a, b, c) = (a, b, c) +@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) +@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) + +# add arrays of matrixes kernel +function kernel_add_mat_z_slices(n, vararg...) + x1, x2, y = get_inputs3(blockIdx().y, vararg...) + i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +function add_z_slices!(y, x1, x2) + m1, n1 = size(x1[1]) #get size of first slice + blocks = (m1 * n1 + threads - 1) ÷ threads + # get length(x1) more blocks than needed to process 1 slice + @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) 
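+    # (the grid's second dimension selects the slice: the kernel reads blockIdx().y
+    # and get_inputs3 maps it back to one (x1, x2, y) triple)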
+end
+
+function add!(y, x1, x2)
+    m1, n1 = size(x1)
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+end
+
+function main()
+    results = BenchmarkGroup()
+
+    num_z_slices = 3
+    Random.seed!(1)
+
+    #m, n = 7, 5 # tiny to measure overhead
+    #m, n = 521, 111
+    #m, n = 1521, 1111
+    #m, n = 3001, 1511 # prime numbers to test memory access correctness
+    m, n = 3072, 1536 # 256 multiplier
+    #m, n = 6007, 3001 # prime numbers to test memory access correctness
+
+    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    y1 = [similar(x1[1]) for i = 1:num_z_slices]
+
+    # reference bare-bones add on GPU
+    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+
+    # adding arrays in an array
+    for slices = 1:num_z_slices
+        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    CUDA.unsafe_free!.(x1)
+    CUDA.unsafe_free!.(x2)
+    CUDA.unsafe_free!.(y1)
+
+    return results
+end
+
+end
+
+ByVal.main()
diff --git a/perf/kernel.jl b/perf/kernel.jl
new file mode 100644
index 000000000..1022a774a
--- /dev/null
+++ b/perf/kernel.jl
@@ -0,0 +1,33 @@
+using CUDA: i32
+
+group = addgroup!(SUITE, "kernel")
+
+group["launch"] = @benchmarkable @cuda identity(nothing)
+
+group["occupancy"] = @benchmarkable begin
+    kernel = @cuda launch=false identity(nothing)
+    launch_configuration(kernel.fun)
+end
+
+src = CUDA.rand(Float32, 512, 1000)
+dest = similar(src)
+function indexing_kernel(dest, src)
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    @inbounds dest[i] = src[i]
+    return
+end
+group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+
+function checked_indexing_kernel(dest, src)
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    dest[i] = src[i]
+    return
+end
+group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+
+function rand_kernel(dest::AbstractArray{T}) where {T}
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    dest[i] = rand(T)
+    return
+end
+group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
new file mode 100644
index 000000000..9cd201353
--- /dev/null
+++ b/perf/latency.jl
@@ -0,0 +1,39 @@
+module Latency
+
+using CUDA
+using BenchmarkTools
+
+function main()
+    results = BenchmarkGroup()
+
+    base_cmd = Base.julia_cmd()
+    if Base.JLOptions().project != C_NULL
+        base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
+    end
+    # NOTE: we don't use Base.active_project() here because of how CI launches this script,
+    #       starting with --project in the main CUDA.jl project.
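+    #       (Base.JLOptions().project is the raw --project flag the parent process was
+    #       started with, which is what we forward to the subprocesses below)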
+ + # time to precompile the package and its dependencies + precompile_cmd = + `$base_cmd -e "pkg = Base.identify_package(\"CUDA\") + Base.compilecache(pkg)"` + results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60 + + # time to actually import the package + import_cmd = + `$base_cmd -e "using CUDA"` + results["import"] = @benchmark run($import_cmd) evals=1 seconds=30 + + # time to actually compile a kernel + ttfp_cmd = + `$base_cmd -e "using CUDA + kernel() = return + CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"` + results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60 + + results +end + +end + +Latency.main() diff --git a/perf/metal.jl b/perf/metal.jl new file mode 100644 index 000000000..5dbcbf0ab --- /dev/null +++ b/perf/metal.jl @@ -0,0 +1,14 @@ +group = addgroup!(SUITE, "cuda") + +let group = addgroup!(group, "synchronization") + let group = addgroup!(group, "stream") + group["blocking"] = @benchmarkable synchronize(blocking=true) + group["auto"] = @benchmarkable synchronize() + group["nonblocking"] = @benchmarkable synchronize(spin=false) + end + let group = addgroup!(group, "context") + group["blocking"] = @benchmarkable device_synchronize(blocking=true) + group["auto"] = @benchmarkable device_synchronize() + group["nonblocking"] = @benchmarkable device_synchronize(spin=false) + end +end diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl new file mode 100644 index 000000000..08348fab5 --- /dev/null +++ b/perf/metaldevrt.jl @@ -0,0 +1,42 @@ +module cudadevrt + +using CUDA, BenchmarkTools, Random + +const threads = 256 +#simple add matrix and vector kernel +function kernel_add_mat_vec(m, x1, x2, y) + # one block per column + offset = (blockIdx().x-1) * m + @inbounds xtmp = x2[blockIdx().x] + for i = threadIdx().x : blockDim().x : m + @inbounds y[offset + i] = x1[offset + i] + xtmp + end + return +end + +function add!(y, x1, x2) + m, n = size(x1) + @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) +end + +function main() + Random.seed!(1) + m, n = 3072, 1536 # 256 multiplier + x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) + x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) + y1 = similar(x1) + + results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2) + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + CUDA.unsafe_free!(x1) + CUDA.unsafe_free!(x2) + CUDA.unsafe_free!(y1) + + return results +end + +end + +cudadevrt.main() + diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl new file mode 100644 index 000000000..5969dcd9b --- /dev/null +++ b/perf/runbenchmarks.jl @@ -0,0 +1,128 @@ +# benchmark suite execution and codespeed submission + +using CUDA + +using BenchmarkTools + +using StableRNGs +rng = StableRNG(123) + +# we only submit results when running on the master branch +real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master" +if real_run + # to find untuned benchmarks + BenchmarkTools.DEFAULT_PARAMETERS.evals = 0 +end + +# convenience macro to create a benchmark that requires synchronizing the GPU +macro async_benchmarkable(ex...) + quote + @benchmarkable CUDA.@sync blocking=true $(ex...) + end +end + +# before anything else, run latency benchmarks. these spawn subprocesses, so we don't want +# to do so after regular benchmarks have caused the memory allocator to reserve memory. 
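+# (each latency benchmark times a fresh julia subprocess; see latency.jl for the three
+# phases measured: precompilation, package import, and first kernel compilation)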
+@info "Running latency benchmarks" +latency_results = include("latency.jl") + +SUITE = BenchmarkGroup() + +# NOTE: don't use spaces in benchmark names (tobami/codespeed#256) + +include("cuda.jl") +include("kernel.jl") +include("array.jl") + +if real_run + @info "Preparing main benchmarks" + warmup(SUITE; verbose=false) + tune!(SUITE) + + # reclaim memory that might have been used by the tuning process + GC.gc(true) + CUDA.reclaim() +end + +# benchmark groups that aren't part of the suite +addgroup!(SUITE, "integration") + +@info "Running main benchmarks" +results = run(SUITE, verbose=true) + +# integration tests (that do nasty things, so need to be run last) +@info "Running integration benchmarks" +integration_results = BenchmarkGroup() +integration_results["volumerhs"] = include("volumerhs.jl") +integration_results["byval"] = include("byval.jl") +integration_results["cudadevrt"] = include("cudadevrt.jl") + +results["latency"] = latency_results +results["integration"] = integration_results + +println(results) + + +## comparison + +# write out the results +BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results) + +# compare against previous results +# TODO: store these results so that we can compare when benchmarking PRs +reference_path = joinpath(@__DIR__, "reference.json") +if ispath(reference_path) + reference = BenchmarkTools.load(reference_path)[1] + comparison = judge(minimum(results), minimum(reference)) + + println("Improvements:") + println(improvements(comparison)) + + println("Regressions:") + println(regressions(comparison)) +end + + +## submission + +using JSON, HTTP + +if real_run + @info "Submitting to Codespeed..." + + basedata = Dict( + "branch" => ENV["CODESPEED_BRANCH"], + "commitid" => ENV["CODESPEED_COMMIT"], + "project" => ENV["CODESPEED_PROJECT"], + "environment" => ENV["CODESPEED_ENVIRONMENT"], + "executable" => ENV["CODESPEED_EXECUTABLE"] + ) + + # convert nested groups of benchmark to flat dictionaries of results + flat_results = [] + function flatten(results, prefix="") + for (key,value) in results + if value isa BenchmarkGroup + flatten(value, "$prefix$key/") + else + @assert value isa BenchmarkTools.Trial + + # codespeed reports maxima, but those are often very noisy. + # get rid of measurements that unnecessarily skew the distribution. + rmskew!(value) + + push!(flat_results, + Dict(basedata..., + "benchmark" => "$prefix$key", + "result_value" => median(value).time / 1e9, + "min" => minimum(value).time / 1e9, + "max" => maximum(value).time / 1e9)) + end + end + end + flatten(results) + + HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/", + ["Content-Type" => "application/x-www-form-urlencoded"], + HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results)))) +end diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl new file mode 100644 index 000000000..5c7737f57 --- /dev/null +++ b/perf/volumerhs.jl @@ -0,0 +1,273 @@ +module VolumeRHS + +using BenchmarkTools +using CUDA +using StableRNGs +using StaticArrays + +function loopinfo(name, expr, nodes...) 
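+    # pushes an Expr(:loopinfo, nodes...) into the loop body, which Julia lowers to
+    # LLVM !llvm.loop metadata; @unroll below uses this to request llvm.loop.unroll.full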
+ if expr.head != :for + error("Syntax error: pragma $name needs a for loop") + end + push!(expr.args[2].args, Expr(:loopinfo, nodes...)) + return expr +end + +macro unroll(expr) + expr = loopinfo("@unroll", expr, (Symbol("llvm.loop.unroll.full"),)) + return esc(expr) +end + +# HACK: module-local versions of core arithmetic; needed to get FMA +for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) + for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) + ir = """ + %x = f$f contract nsz $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) + end + end + end + @eval function $jlf(args...) + Base.$jlf(args...) + end +end + +let (jlf, f) = (:div_arcp, :div) + for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) + ir = """ + %x = f$f fast $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) + end + end + end + @eval function $jlf(args...) + Base.$jlf(args...) + end +end +rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call + +# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) +# rcp(x) = div_fast(one(x), x) + +# note the order of the fields below is also assumed in the code. +const _nstate = 5 +const _ρ, _U, _V, _W, _E = 1:_nstate +const stateid = (ρ = _ρ, U = _U, V = _V, W = _W, E = _E) + +const _nvgeo = 14 +const _ξx, _ηx, _ζx, _ξy, _ηy, _ζy, _ξz, _ηz, _ζz, _MJ, _MJI, +_x, _y, _z = 1:_nvgeo +const vgeoid = (ξx = _ξx, ηx = _ηx, ζx = _ζx, + ξy = _ξy, ηy = _ηy, ζy = _ζy, + ξz = _ξz, ηz = _ηz, ζz = _ζz, + MJ = _MJ, MJI = _MJI, + x = _x, y = _y, z = _z) + +const N = 4 +const nmoist = 0 +const ntrace = 0 + +Base.@irrational grav 9.81 BigFloat(9.81) +Base.@irrational gdm1 0.4 BigFloat(0.4) + +function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) + Q = Base.Experimental.Const(Q) + vgeo = Base.Experimental.Const(vgeo) + D = Base.Experimental.Const(D) + + nvar = _nstate + nmoist + ntrace + + Nq = N + 1 + + s_D = CuStaticSharedArray(eltype(D), (Nq, Nq)) + s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate)) + s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate)) + + r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) + + e = blockIdx().x + j = threadIdx().y + i = threadIdx().x + + @inbounds begin + for k in 1:Nq + r_rhsρ[k] = zero(eltype(rhs)) + r_rhsU[k] = zero(eltype(rhs)) + r_rhsV[k] = zero(eltype(rhs)) + r_rhsW[k] = zero(eltype(rhs)) + r_rhsE[k] = zero(eltype(rhs)) + end + + # fetch D into shared + s_D[i, j] = D[i, j] + @unroll for k in 1:Nq + sync_threads() + + # Load values will need into registers + MJ = vgeo[i, j, k, _MJ, e] + ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e] + ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e] + ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e] + z = vgeo[i,j,k,_z,e] + + U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e] + ρ, E = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e] + + # GPU performance trick + # Allow optimizations to use the reciprocal of an argument rather than perform division. 
+ # IEEE floating-point division is implemented as a function call + ρinv = rcp(ρ) + ρ2inv = rcp(2ρ) + # ρ2inv = 0.5f0 * pinv + + P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z) + + fluxρ_x = U + fluxU_x = ρinv * U * U + P + fluxV_x = ρinv * U * V + fluxW_x = ρinv * U * W + fluxE_x = ρinv * U * (E + P) + + fluxρ_y = V + fluxU_y = ρinv * V * U + fluxV_y = ρinv * V * V + P + fluxW_y = ρinv * V * W + fluxE_y = ρinv * V * (E + P) + + fluxρ_z = W + fluxU_z = ρinv * W * U + fluxV_z = ρinv * W * V + fluxW_z = ρinv * W * W + P + fluxE_z = ρinv * W * (E + P) + + s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) + s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) + s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) + s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) + s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) + + s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) + s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) + s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) + s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) + s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) + + r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) + r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) + r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) + r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) + r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) + + # one shared access per 10 flops + for n = 1:Nq + Dkn = s_D[k, n] + + r_rhsρ[n] += Dkn * r_Hρ + r_rhsU[n] += Dkn * r_HU + r_rhsV[n] += Dkn * r_HV + r_rhsW[n] += Dkn * r_HW + r_rhsE[n] += Dkn * r_HE + end + + r_rhsW[k] -= MJ * ρ * gravity + + sync_threads() + + # loop of ξ-grid lines + @unroll for n = 1:Nq + Dni = s_D[n, i] + Dnj = s_D[n, j] + + r_rhsρ[k] += Dni * s_F[n, j, _ρ] + r_rhsρ[k] += Dnj * s_G[i, n, _ρ] + + r_rhsU[k] += Dni * s_F[n, j, _U] + r_rhsU[k] += Dnj * s_G[i, n, _U] + + r_rhsV[k] += Dni * s_F[n, j, _V] + r_rhsV[k] += Dnj * s_G[i, n, _V] + + r_rhsW[k] += Dni * s_F[n, j, _W] + r_rhsW[k] += Dnj * s_G[i, n, _W] + + r_rhsE[k] += Dni * s_F[n, j, _E] + r_rhsE[k] += Dnj * s_G[i, n, _E] + end + end # k + + @unroll for k in 1:Nq + MJI = vgeo[i, j, k, _MJI, e] + + # Updates are a performance bottleneck + # primary source of stall_long_sb + rhs[i, j, k, _U, e] += MJI*r_rhsU[k] + rhs[i, j, k, _V, e] += MJI*r_rhsV[k] + rhs[i, j, k, _W, e] += MJI*r_rhsW[k] + rhs[i, j, k, _ρ, e] += MJI*r_rhsρ[k] + rhs[i, j, k, _E, e] += MJI*r_rhsE[k] + end + end + return +end + +function main() + DFloat = Float32 + nelem = 240_000 + + rng = StableRNG(123) + + Nq = N + 1 + nvar = _nstate + nmoist + ntrace + + Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) + Q[:, :, :, _E, :] .+= 20 + vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) + + # make sure the entries of the mass matrix satisfy the inverse relation + vgeo[:, :, :, _MJ, :] .+= 3 + vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] + + D = CuArray(rand(rng, DFloat, Nq, Nq)) + + rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) + + threads=(N+1, N+1) + + kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) + # XXX: should we print these for all kernels? maybe upload them to Codespeed? + @info """volumerhs! 
details: + - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads + - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, + $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, + $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + results = @benchmark begin + CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; + threads=$threads, blocks=$nelem) + end + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + CUDA.unsafe_free!(rhs) + CUDA.unsafe_free!(Q) + CUDA.unsafe_free!(vgeo) + CUDA.unsafe_free!(D) + + results +end + +end + +VolumeRHS.main() From d2f63c8b109ec91bfed8056748a6b4ebf471eed0 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 17 Sep 2024 09:50:07 -0300 Subject: [PATCH 2/8] Add CI --- .buildkite/pipeline.yml | 132 +++++++++++++++++++++++++++++++- .github/workflows/Benchmark.yml | 63 +++++++++++++++ perf/Project.toml | 1 + perf/runbenchmarks.jl | 7 +- 4 files changed, 197 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/Benchmark.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 3cc99e534..ccfdda4ff 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,7 +19,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only julia\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip julia\]/ timeout_in_minutes: 60 matrix: setup: @@ -46,7 +51,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 matrix: setup: @@ -75,7 +85,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 - label: "Opaque pointers" plugins: @@ -95,5 +110,114 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 + + # we want to benchmark every commit on the master branch, even if it failed CI + - wait: ~ + # continue_on_failure: true + + - group: ":racehorse: Benchmarks" + steps: + - label: "Benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia --project=perf -e ' + using Pkg + + println("--- :julia: Instantiating project") + Pkg.develop([PackageSpec(path=pwd())]) + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) + + println("+++ :julia: Benchmarking") + include("perf/runbenchmarks.jl")' + artifact_paths: + - "benchmarkresults.json" + agents: + queue: "juliaecosystem" + os: "macos" + arch: 
"aarch64" + if: | + build.message =~ /\[only benchmarks\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip benchmarks\]/ + timeout_in_minutes: 30 + # - label: "Benchmarks (dry run)" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" + # command: | + # julia --project -e ' + # using Pkg + + # println("--- :julia: Instantiating project") + # Pkg.resolve() + # Pkg.instantiate() + # Pkg.activate("perf") + # Pkg.resolve() + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Benchmarking") + # include("perf/runbenchmarks.jl")' + # artifact_paths: + # - "results.json" + # agents: + # queue: "juliaecosystem" + # os: "macos" + # arch: "aarch64" + # if: | + # build.message =~ /\[only benchmarks\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip benchmarks\]/ + # timeout_in_minutes: 30 + + # if we will submit results, use the benchmark queue so that we will + # be running on the same system each time + # - label: "Benchmarks on Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # env: + # BENCHMARKS: "true" + # CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" + # CODESPEED_BRANCH: "$BUILDKITE_BRANCH" + # CODESPEED_COMMIT: "$BUILDKITE_COMMIT" + # CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}" + # command: | + # julia --project -e ' + # using Pkg + # ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] + + # println("--- :julia: Instantiating project") + # Pkg.resolve() + # Pkg.instantiate() + # Pkg.activate("perf") + # Pkg.resolve() + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Benchmarking") + # include("perf/runbenchmarks.jl")' + # agents: + # queue: "benchmark" + # gpu: "rtx2070" + # cuda: "*" + # if: | + # build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ || + # build.branch =~ /^master$$/ && build.message !~ /\[only/ && + # build.message !~ /\[skip benchmarks\]/ + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # timeout_in_minutes: 30 diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml new file mode 100644 index 000000000..9c43a8417 --- /dev/null +++ b/.github/workflows/Benchmark.yml @@ -0,0 +1,63 @@ +name: Benchmarks +permissions: + contents: write # contents permission to update benchmark contents in gh-pages branch + statuses: read + deployments: write # deployments permission to deploy GitHub pages website + pull-requests: write + +on: + pull_request: + branches: + - main + paths: + - "src/**/*" + - "ext/**/*" + - "perf/**/*" + - ".buildkite/**/*" + - "Project.toml" + - ".github/workflows/Benchmark.yml" + push: + branches: + - main + paths: + - "src/**/*" + - "ext/**/*" + - "benchmarks/**/*" + - ".buildkite/**/*" + - "Project.toml" + - ".github/workflows/Benchmark.yml" + +jobs: + benchmark: + if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Download Buildkite Artifacts + id: download + uses: EnricoMi/download-buildkite-artifact-action@v1 + with: + buildkite_token: ${{ secrets.BUILDKITE_TOKEN }} + ignore_build_states: blocked,canceled,skipped,not_run,failed + ignore_job_states: timed_out,failed + output_path: artifacts + + - name: Locate Benchmarks Artifact + id: locate + if: ${{ steps.download.outputs.download-state == 'success' }} + run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null)" >> $GITHUB_OUTPUT + + 
- name: Upload Benchmark Results + if: ${{ steps.locate.outputs.path != '' }} + uses: benchmark-action/github-action-benchmark@v1 + with: + name: Metal Benchmarks + tool: "julia" + output-file-path: ${{ steps.locate.outputs.path }} + benchmark-data-dir-path: "" + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + summary-always: true + alert-threshold: "150%" + fail-on-alert: false + auto-push: ${{ github.event_name != 'pull_request' }} diff --git a/perf/Project.toml b/perf/Project.toml index 8314c7b28..decfbe75f 100644 --- a/perf/Project.toml +++ b/perf/Project.toml @@ -2,5 +2,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 5969dcd9b..e3c6c6e9f 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -14,6 +14,9 @@ if real_run BenchmarkTools.DEFAULT_PARAMETERS.evals = 0 end +# print system information +@info "System information:\n" * sprint(io->Metal.versioninfo(io)) + # convenience macro to create a benchmark that requires synchronizing the GPU macro async_benchmarkable(ex...) quote @@ -66,7 +69,7 @@ println(results) ## comparison # write out the results -BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results) +BenchmarkTools.save("benchmarkresults.json", results) # compare against previous results # TODO: store these results so that we can compare when benchmarking PRs @@ -85,7 +88,7 @@ end ## submission -using JSON, HTTP +# using JSON, HTTP if real_run @info "Submitting to Codespeed..." From 9175ab8254ec2e4f9e4b9e6616d14e661a734f94 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:26:56 -0300 Subject: [PATCH 3/8] Adapt for Metal [only benchmarks] --- perf/array.jl | 48 ++++++++++++------------ perf/byval.jl | 33 ++++++++--------- perf/kernel.jl | 36 +++++++++--------- perf/latency.jl | 12 +++--- perf/metal.jl | 14 ++----- perf/metaldevrt.jl | 26 ++++++------- perf/runbenchmarks.jl | 78 +++++++-------------------------------- perf/volumerhs.jl | 86 ++++++++++++++++++++++--------------------- 8 files changed, 139 insertions(+), 194 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index a6a226ccf..0c57a7dfa 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -5,16 +5,16 @@ const n = 1000 # generate some arrays cpu_mat = rand(rng, Float32, m, n) -gpu_mat = CuArray{Float32}(undef, size(cpu_mat)) +gpu_mat = MtlArray{Float32}(undef, size(cpu_mat)) gpu_vec = reshape(gpu_mat, length(gpu_mat)) gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) -gpu_mat_ints = CuArray(rand(rng, Int, m, n)) +gpu_mat_ints = MtlArray(rand(rng, Int, m, n)) gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) -gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) +gpu_mat_bools = MtlArray(rand(rng, Bool, m, n)) gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) -group["construct"] = @benchmarkable CuArray{Int}(undef, 1) +group["construct"] = @benchmarkable MtlArray{Int}(undef, 1) group["copy"] = @async_benchmarkable copy($gpu_mat) @@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!") end let group = addgroup!(group, "iteration") - group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] + group["scalar"] = 
@benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10] group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] @@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration") end end -let group = addgroup!(group, "reverse") - group["1d"] = @async_benchmarkable reverse($gpu_vec) - group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) - group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) - group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) -end +# let group = addgroup!(group, "reverse") +# group["1d"] = @async_benchmarkable reverse($gpu_vec) +# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) +# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) +# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) +# end group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 @@ -77,31 +77,31 @@ end let group = addgroup!(group, "random") let group = addgroup!(group, "rand") - group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) - group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) + group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n) + group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n) end let group = addgroup!(group, "rand!") - group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) - group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) + group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec) + group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints) end let group = addgroup!(group, "randn") - group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) - #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n) + group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n) + # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n) end let group = addgroup!(group, "randn!") - group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) - #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints) + group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec) + # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints) end end -let group = addgroup!(group, "sorting") - group["1d"] = @async_benchmarkable sort($gpu_vec) - group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) - group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) -end +# let group = addgroup!(group, "sorting") +# group["1d"] = @async_benchmarkable sort($gpu_vec) +# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) +# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +# end let group = addgroup!(group, "permutedims") group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) diff --git a/perf/byval.jl b/perf/byval.jl index d32d62a9c..6a5466418 100644 --- a/perf/byval.jl +++ b/perf/byval.jl @@ -1,13 +1,12 @@ module ByVal -using CUDA, BenchmarkTools, Random -using CUDA: i32 +using Metal, BenchmarkTools, Random const threads = 256 # simple add matrixes kernel function kernel_add_mat(n, x1, x2, y) - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = thread_position_in_grid_1d() if i <= n @inbounds y[i] = x1[i] + x2[i] end @@ -20,8 +19,8 @@ end # add arrays of matrixes kernel function kernel_add_mat_z_slices(n, vararg...) - x1, x2, y = get_inputs3(blockIdx().y, vararg...) - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...) 
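+    # (threadgroup_position_in_grid_2d().y takes over the role of CUDA's blockIdx().y:
+    # it selects which (x1, x2, y) slice this threadgroup adds)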
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -30,15 +29,15 @@ end
 
 function add_z_slices!(y, x1, x2)
     m1, n1 = size(x1[1]) #get size of first slice
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    # get length(x1) more blocks than needed to process 1 slice
-    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more groups than needed to process 1 slice
+    @metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
 end
 
 function add!(y, x1, x2)
     m1, n1 = size(x1)
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    @metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
 end
 
 function main()
@@ -54,22 +53,22 @@ function main()
     m, n = 3072, 1536 # 256 multiplier
     #m, n = 6007, 3001 # prime numbers to test memory access correctness
 
-    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
-    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
     y1 = [similar(x1[1]) for i = 1:num_z_slices]
 
     # reference bare-bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])
 
     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!.(x1)
-    CUDA.unsafe_free!.(x2)
-    CUDA.unsafe_free!.(y1)
+    Metal.unsafe_free!.(x1)
+    Metal.unsafe_free!.(x2)
+    Metal.unsafe_free!.(y1)
 
     return results
 end
diff --git a/perf/kernel.jl b/perf/kernel.jl
index 1022a774a..5cfcc9242 100644
--- a/perf/kernel.jl
+++ b/perf/kernel.jl
@@ -1,33 +1,35 @@
-using CUDA: i32
+# using GPUArrays
 
 group = addgroup!(SUITE, "kernel")
 
-group["launch"] = @benchmarkable @cuda identity(nothing)
+group["launch"] = @benchmarkable @metal identity(nothing)
 
-group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false identity(nothing)
-    launch_configuration(kernel.fun)
-end
+# group["occupancy"] = @benchmarkable begin
+#     kernel = @metal launch=false identity(nothing)
+#     GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
+#     return
+# end
 
-src = CUDA.rand(Float32, 512, 1000)
+src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)
 function indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
    @inbounds dest[i] = src[i]
     return
 end
-group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
 
 function checked_indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     dest[i] = src[i]
     return
 end
-group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
 
-function rand_kernel(dest::AbstractArray{T}) where {T}
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    dest[i] = rand(T)
-    return
-end
-group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
+## DELETE
+# function rand_kernel(dest::AbstractArray{T}) where {T}
+#     i = thread_position_in_grid_1d()
+#     dest[i] = Metal.rand(T)
+#     return
+# end
+# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
index 9cd201353..a1066abe8 100644
--- a/perf/latency.jl
+++ b/perf/latency.jl
@@ -1,6 +1,6 @@
 module Latency
 
-using CUDA
+using Metal
 using BenchmarkTools
 
 function main()
@@ -11,24 +11,24 @@ function main()
         base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
     end
     # NOTE: we don't use Base.active_project() here because of how CI launches this script,
-    #       starting with --project in the main CUDA.jl project.
+    #       starting with --project in the main Metal.jl project.
 
     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+        `$base_cmd -e "pkg = Base.identify_package(\"Metal\")
                        Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60
 
     # time to actually import the package
     import_cmd =
-        `$base_cmd -e "using CUDA"`
+        `$base_cmd -e "using Metal"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30
 
     # time to actually compile a kernel
     ttfp_cmd =
-        `$base_cmd -e "using CUDA
+        `$base_cmd -e "using Metal
                        kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
 
     results
diff --git a/perf/metal.jl b/perf/metal.jl
index 5dbcbf0ab..5e136b0ed 100644
--- a/perf/metal.jl
+++ b/perf/metal.jl
@@ -1,14 +1,6 @@
-group = addgroup!(SUITE, "cuda")
+group = addgroup!(SUITE, "metal")
 
 let group = addgroup!(group, "synchronization")
-    let group = addgroup!(group, "stream")
-        group["blocking"] = @benchmarkable synchronize(blocking=true)
-        group["auto"] = @benchmarkable synchronize()
-        group["nonblocking"] = @benchmarkable synchronize(spin=false)
-    end
-    let group = addgroup!(group, "context")
-        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
-        group["auto"] = @benchmarkable device_synchronize()
-        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
-    end
+    group["stream"] = @benchmarkable synchronize()
+    group["context"] = @benchmarkable device_synchronize()
 end
diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl
index 08348fab5..a3dbd07fc 100644
--- a/perf/metaldevrt.jl
+++ b/perf/metaldevrt.jl
@@ -1,14 +1,14 @@
-module cudadevrt
+module metaldevrt
 
-using CUDA, BenchmarkTools, Random
+using Metal, BenchmarkTools, Random
 
 const threads = 256
 #simple add matrix and vector kernel
 function kernel_add_mat_vec(m, x1, x2, y)
     # one block per column
-    offset = (blockIdx().x-1) * m
-    @inbounds xtmp = x2[blockIdx().x]
-    for i = threadIdx().x : blockDim().x : m
+    offset = (threadgroup_position_in_grid_2d().x-1) * m
+    @inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
+    for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m
         @inbounds y[offset + i] = x1[offset + i] + xtmp
     end
     return
 end
 
 function add!(y, x1, x2)
     m, n = size(x1)
-    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+    @metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
 end
 
 function main()
     Random.seed!(1)
     m, n = 3072, 1536 # 256 multiplier
-    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
-    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)
 
-    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
+    results = @benchmark Metal.@sync add!($y1, $x1, $x2)
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(x1)
-    CUDA.unsafe_free!(x2)
-    CUDA.unsafe_free!(y1)
+    Metal.unsafe_free!(x1)
+    Metal.unsafe_free!(x2)
+    Metal.unsafe_free!(y1)
 
     return results
 end
 
 end
 
-cudadevrt.main()
+metaldevrt.main()
 
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index e3c6c6e9f..549b01c9c 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -1,18 +1,14 @@
 # benchmark suite execution and codespeed submission
 
-using CUDA
+using Metal
 
 using BenchmarkTools
 
 using StableRNGs
 rng = StableRNG(123)
 
-# we only submit results when running on the master branch
-real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master"
-if real_run
-    # to find untuned benchmarks
-    BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
-end
+# to find untuned benchmarks
+BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
 
 # print system information
 @info "System information:\n" * sprint(io->Metal.versioninfo(io))
 
 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync blocking=true $(ex...)
+        @benchmarkable Metal.@sync $(ex...)
     end
 end
@@ -33,19 +29,18 @@ SUITE = BenchmarkGroup()
 
 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
 
-include("cuda.jl")
+include("metal.jl")
 include("kernel.jl")
 include("array.jl")
 
-if real_run
-    @info "Preparing main benchmarks"
-    warmup(SUITE; verbose=false)
-    tune!(SUITE)
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)
 
-    # reclaim memory that might have been used by the tuning process
-    GC.gc(true)
-    CUDA.reclaim()
-end
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+GC.gc(true)
+GC.gc(true)
 
 # benchmark groups that aren't part of the suite
 addgroup!(SUITE, "integration")
 
 @info "Running main benchmarks"
 results = run(SUITE, verbose=true)
 
 # integration tests (that do nasty things, so need to be run last)
 @info "Running integration benchmarks"
 integration_results = BenchmarkGroup()
-integration_results["volumerhs"] = include("volumerhs.jl")
+# integration_results["volumerhs"] = include("volumerhs.jl")
 integration_results["byval"] = include("byval.jl")
-integration_results["cudadevrt"] = include("cudadevrt.jl")
+integration_results["metaldevrt"] = include("metaldevrt.jl")
 
 results["latency"] = latency_results
 results["integration"] = integration_results
 
 println(results)
 
 
 ## comparison
 
 # write out the results
-BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results)
+BenchmarkTools.save("benchmarkresults.json", results)
 
 # compare against previous results
 # TODO: store these results so that we can compare when benchmarking PRs
 reference_path = joinpath(@__DIR__, "reference.json")
 if ispath(reference_path)
     reference = BenchmarkTools.load(reference_path)[1]
     comparison = judge(minimum(results), minimum(reference))
 
     println("Improvements:")
     println(improvements(comparison))
 
     println("Regressions:")
     println(regressions(comparison))
 end
-
-
-## submission
-
-# using JSON, HTTP
-
-if real_run
-    @info "Submitting to Codespeed..."
- - basedata = Dict( - "branch" => ENV["CODESPEED_BRANCH"], - "commitid" => ENV["CODESPEED_COMMIT"], - "project" => ENV["CODESPEED_PROJECT"], - "environment" => ENV["CODESPEED_ENVIRONMENT"], - "executable" => ENV["CODESPEED_EXECUTABLE"] - ) - - # convert nested groups of benchmark to flat dictionaries of results - flat_results = [] - function flatten(results, prefix="") - for (key,value) in results - if value isa BenchmarkGroup - flatten(value, "$prefix$key/") - else - @assert value isa BenchmarkTools.Trial - - # codespeed reports maxima, but those are often very noisy. - # get rid of measurements that unnecessarily skew the distribution. - rmskew!(value) - - push!(flat_results, - Dict(basedata..., - "benchmark" => "$prefix$key", - "result_value" => median(value).time / 1e9, - "min" => minimum(value).time / 1e9, - "max" => maximum(value).time / 1e9)) - end - end - end - flatten(results) - - HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/", - ["Content-Type" => "application/x-www-form-urlencoded"], - HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results)))) -end diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 5c7737f57..f5b3b6d35 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -1,7 +1,7 @@ module VolumeRHS using BenchmarkTools -using CUDA +using Metal using StableRNGs using StaticArrays @@ -20,16 +20,16 @@ end # HACK: module-local versions of core arithmetic; needed to get FMA for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) - for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) - ir = """ - %x = f$f contract nsz $llvmT %0, %1 - ret $llvmT %x - """ - @eval begin - # the @pure is necessary so that we can constant propagate. - @inline Base.@pure function $jlf(a::$T, b::$T) - Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) - end + T = :Float32 + llvmT = "float" + ir = """ + %x = f$f contract nsz $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) end end @eval function $jlf(args...) @@ -38,16 +38,16 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) end let (jlf, f) = (:div_arcp, :div) - for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) - ir = """ - %x = f$f fast $llvmT %0, %1 - ret $llvmT %x - """ - @eval begin - # the @pure is necessary so that we can constant propagate. - @inline Base.@pure function $jlf(a::$T, b::$T) - Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) - end + T = :Float32 + llvmT = "float" + ir = """ + %x = f$f fast $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) end end @eval function $jlf(args...) 
@@ -77,8 +77,10 @@ const N = 4 const nmoist = 0 const ntrace = 0 -Base.@irrational grav 9.81 BigFloat(9.81) -Base.@irrational gdm1 0.4 BigFloat(0.4) +# Base.@irrational grav 9.81 BigFloat(9.81) +const grav = 9.81f0 +# Base.@irrational gdm1 0.4 BigFloat(0.4) +const gdm1 = 0.4f0 function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) Q = Base.Experimental.Const(Q) @@ -99,9 +101,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) - e = blockIdx().x - j = threadIdx().y - i = threadIdx().x + e = threadgroup_position_in_grid_2d().x + j = thread_position_in_threadgroup_2d().y + i = thread_position_in_threadgroup_2d().x @inbounds begin for k in 1:Nq @@ -233,37 +235,37 @@ function main() Nq = N + 1 nvar = _nstate + nmoist + ntrace - Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) + Q = 1 .+ MtlArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) Q[:, :, :, _E, :] .+= 20 - vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) + vgeo = MtlArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) # make sure the entries of the mass matrix satisfy the inverse relation vgeo[:, :, :, _MJ, :] .+= 3 vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] - D = CuArray(rand(rng, DFloat, Nq, Nq)) + D = MtlArray(rand(rng, DFloat, Nq, Nq)) - rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) + rhs = MtlArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) threads=(N+1, N+1) - kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) + kernel = @metal launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) # XXX: should we print these for all kernels? maybe upload them to Codespeed? - @info """volumerhs! details: - - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads - - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, - $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, - $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + # @info """volumerhs! 
details: + # - $(Metal.registers(kernel)) registers, max $(Metal.maxthreads(kernel)) threads + # - $(Base.format_bytes(Metal.memory(kernel).local)) local memory, + # $(Base.format_bytes(Metal.memory(kernel).shared)) shared memory, + # $(Base.format_bytes(Metal.memory(kernel).constant)) constant memory""" results = @benchmark begin - CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; - threads=$threads, blocks=$nelem) + Metal.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; + threads=$threads, groups=$nelem) end # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them - CUDA.unsafe_free!(rhs) - CUDA.unsafe_free!(Q) - CUDA.unsafe_free!(vgeo) - CUDA.unsafe_free!(D) + Metal.unsafe_free!(rhs) + Metal.unsafe_free!(Q) + Metal.unsafe_free!(vgeo) + Metal.unsafe_free!(D) results end From cc8de561c65d96a9a617fbc4cbf15c82a1f81538 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:08:49 -0300 Subject: [PATCH 4/8] Save median of results [only benchmarks] --- perf/runbenchmarks.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 549b01c9c..49078a643 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -64,7 +64,7 @@ println(results) ## comparison # write out the results -BenchmarkTools.save("benchmarkresults.json", results) +BenchmarkTools.save("benchmarkresults.json", median(results)) # compare against previous results # TODO: store these results so that we can compare when benchmarking PRs From a448039110778aab9107fd817560648d78386d70 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:32:24 -0300 Subject: [PATCH 5/8] Cleanup --- .buildkite/pipeline.yml | 71 ----------------------------------------- 1 file changed, 71 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ccfdda4ff..9dbcda8f8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -150,74 +150,3 @@ steps: build.message !~ /\[only/ && !build.pull_request.draft && build.message !~ /\[skip benchmarks\]/ timeout_in_minutes: 30 - # - label: "Benchmarks (dry run)" - # plugins: - # - JuliaCI/julia#v1: - # version: "1.10" - # command: | - # julia --project -e ' - # using Pkg - - # println("--- :julia: Instantiating project") - # Pkg.resolve() - # Pkg.instantiate() - # Pkg.activate("perf") - # Pkg.resolve() - # Pkg.instantiate() - # push!(LOAD_PATH, @__DIR__) - - # println("+++ :julia: Benchmarking") - # include("perf/runbenchmarks.jl")' - # artifact_paths: - # - "results.json" - # agents: - # queue: "juliaecosystem" - # os: "macos" - # arch: "aarch64" - # if: | - # build.message =~ /\[only benchmarks\]/ || - # build.message !~ /\[only/ && !build.pull_request.draft && - # build.message !~ /\[skip benchmarks\]/ - # timeout_in_minutes: 30 - - # if we will submit results, use the benchmark queue so that we will - # be running on the same system each time - # - label: "Benchmarks on Julia {{matrix.julia}}" - # plugins: - # - JuliaCI/julia#v1: - # version: "{{matrix.julia}}" - # env: - # BENCHMARKS: "true" - # CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - # CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - # CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - # CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}" - # command: | - # julia --project -e ' - # using Pkg - # ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] 
- - # println("--- :julia: Instantiating project") - # Pkg.resolve() - # Pkg.instantiate() - # Pkg.activate("perf") - # Pkg.resolve() - # Pkg.instantiate() - # push!(LOAD_PATH, @__DIR__) - - # println("+++ :julia: Benchmarking") - # include("perf/runbenchmarks.jl")' - # agents: - # queue: "benchmark" - # gpu: "rtx2070" - # cuda: "*" - # if: | - # build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ || - # build.branch =~ /^master$$/ && build.message !~ /\[only/ && - # build.message !~ /\[skip benchmarks\]/ - # matrix: - # setup: - # julia: - # - "1.10" - # - "1.11" - # timeout_in_minutes: 30 From 57f004f2d6f6b5b5d191f8158aa06575627ba75e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:41:19 -0300 Subject: [PATCH 6/8] Don't disable main tests when a PR is a draft --- .buildkite/pipeline.yml | 2 +- .github/workflows/Benchmark.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 9dbcda8f8..45471a42b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -22,7 +22,7 @@ steps: if: | build.message =~ /\[only tests\]/ || build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[only/ && build.message !~ /\[skip tests\]/ && build.message !~ /\[skip julia\]/ timeout_in_minutes: 60 diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml index 9c43a8417..1904c1640 100644 --- a/.github/workflows/Benchmark.yml +++ b/.github/workflows/Benchmark.yml @@ -29,7 +29,7 @@ on: jobs: benchmark: - if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }} + if: ${{ contains(github.event.head_commit.message, '[only benchmarks]') || !contains(github.event.head_commit.message, '[only') && !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.draft == false }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 From c3a6fb37f267ed3a81218c65ffe32e68e32586fb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 24 Sep 2024 01:13:47 -0300 Subject: [PATCH 7/8] Benchmark on macOS 15 to catch potential performance impacts of new features. --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 45471a42b..fef716300 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -142,9 +142,10 @@ steps: artifact_paths: - "benchmarkresults.json" agents: - queue: "juliaecosystem" + queue: "julia" os: "macos" arch: "aarch64" + macos_version: "15.0" if: | build.message =~ /\[only benchmarks\]/ || build.message !~ /\[only/ && !build.pull_request.draft && From cfd94990af685a94f90514a1592b5e7226edc47e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 24 Sep 2024 08:26:23 -0300 Subject: [PATCH 8/8] Use juliaecosystem runner for macOS 15 --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index fef716300..b35a38233 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -142,7 +142,7 @@ steps: artifact_paths: - "benchmarkresults.json" agents: - queue: "julia" + queue: "juliaecosystem" os: "macos" arch: "aarch64" macos_version: "15.0"