From 87f3862db8bfcde9a7921f456007368bf789cd81 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:59:48 -0300 Subject: [PATCH 1/8] Copy-paste CUDA benchmarks --- perf/.gitignore | 2 + perf/Project.toml | 6 + perf/array.jl | 110 +++++++++++++++++ perf/byval.jl | 79 ++++++++++++ perf/kernel.jl | 33 +++++ perf/latency.jl | 39 ++++++ perf/metal.jl | 14 +++ perf/metaldevrt.jl | 42 +++++++ perf/runbenchmarks.jl | 128 ++++++++++++++++++++ perf/volumerhs.jl | 273 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 726 insertions(+) create mode 100644 perf/.gitignore create mode 100644 perf/Project.toml create mode 100644 perf/array.jl create mode 100644 perf/byval.jl create mode 100644 perf/kernel.jl create mode 100644 perf/latency.jl create mode 100644 perf/metal.jl create mode 100644 perf/metaldevrt.jl create mode 100644 perf/runbenchmarks.jl create mode 100644 perf/volumerhs.jl diff --git a/perf/.gitignore b/perf/.gitignore new file mode 100644 index 000000000..124aa781c --- /dev/null +++ b/perf/.gitignore @@ -0,0 +1,2 @@ +results.json +reference.json diff --git a/perf/Project.toml b/perf/Project.toml new file mode 100644 index 000000000..8314c7b28 --- /dev/null +++ b/perf/Project.toml @@ -0,0 +1,6 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/array.jl b/perf/array.jl new file mode 100644 index 000000000..a6a226ccf --- /dev/null +++ b/perf/array.jl @@ -0,0 +1,110 @@ +group = addgroup!(SUITE, "array") + +const m = 512 +const n = 1000 + +# generate some arrays +cpu_mat = rand(rng, Float32, m, n) +gpu_mat = CuArray{Float32}(undef, size(cpu_mat)) +gpu_vec = reshape(gpu_mat, length(gpu_mat)) +gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) +gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) +gpu_mat_ints = CuArray(rand(rng, Int, m, n)) +gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) +gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) +gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) + +group["construct"] = @benchmarkable CuArray{Int}(undef, 1) + +group["copy"] = @async_benchmarkable copy($gpu_mat) + +gpu_mat2 = copy(gpu_mat) +let group = addgroup!(group, "copyto!") + group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) + group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) + group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) +end + +let group = addgroup!(group, "iteration") + group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] + + group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] + + let group = addgroup!(group, "findall") + group["bool"] = @benchmarkable findall($gpu_vec_bools) + group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findfirst") + group["bool"] = @benchmarkable findfirst($gpu_vec_bools) + group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) + end + + let group = addgroup!(group, "findmin") # findmax + group["1d"] = @async_benchmarkable findmin($gpu_vec) + group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) + end +end + +let group = addgroup!(group, "reverse") + group["1d"] = @async_benchmarkable reverse($gpu_vec) + group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) + group["1d_inplace"] = 
@async_benchmarkable reverse!($gpu_vec) + group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) +end + +group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + +# no need to test inplace version, which performs the same operation (but with an alloc) +let group = addgroup!(group, "accumulate") + group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) + group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) +end + +let group = addgroup!(group, "reductions") + let group = addgroup!(group, "reduce") + group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) + group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) + end + + let group = addgroup!(group, "mapreduce") + group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) + group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) + end + + # used by sum, prod, minimum, maximum, all, any, count +end + +let group = addgroup!(group, "random") + let group = addgroup!(group, "rand") + group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) + group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) + end + + let group = addgroup!(group, "rand!") + group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) + group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) + end + + let group = addgroup!(group, "randn") + group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) + #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n) + end + + let group = addgroup!(group, "randn!") + group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) + #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints) + end +end + +let group = addgroup!(group, "sorting") + group["1d"] = @async_benchmarkable sort($gpu_vec) + group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) + group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +end + +let group = addgroup!(group, "permutedims") + group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) + group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) + group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) +end diff --git a/perf/byval.jl b/perf/byval.jl new file mode 100644 index 000000000..d32d62a9c --- /dev/null +++ b/perf/byval.jl @@ -0,0 +1,79 @@ +module ByVal + +using CUDA, BenchmarkTools, Random +using CUDA: i32 + +const threads = 256 + +# simple add matrixes kernel +function kernel_add_mat(n, x1, x2, y) + i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +@inline get_inputs3(indx_y, a, b, c) = (a, b, c) +@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) +@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) + +# add arrays of matrixes kernel +function kernel_add_mat_z_slices(n, vararg...) + x1, x2, y = get_inputs3(blockIdx().y, vararg...) + i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + if i <= n + @inbounds y[i] = x1[i] + x2[i] + end + return +end + +function add_z_slices!(y, x1, x2) + m1, n1 = size(x1[1]) #get size of first slice + blocks = (m1 * n1 + threads - 1) ÷ threads + # get length(x1) more blocks than needed to process 1 slice + @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) 
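+    # (the grid's second dimension selects the slice: the kernel reads blockIdx().y
+    # and get_inputs3 maps it back to one (x1, x2, y) triple)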
+end
+
+function add!(y, x1, x2)
+    m1, n1 = size(x1)
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+end
+
+function main()
+    results = BenchmarkGroup()
+
+    num_z_slices = 3
+    Random.seed!(1)
+
+    #m, n = 7, 5 # tiny to measure overhead
+    #m, n = 521, 111
+    #m, n = 1521, 1111
+    #m, n = 3001, 1511 # prime numbers to test memory access correctness
+    m, n = 3072, 1536 # 256 multiplier
+    #m, n = 6007, 3001 # prime numbers to test memory access correctness
+
+    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    y1 = [similar(x1[1]) for i = 1:num_z_slices]
+
+    # reference bare-bones add on GPU
+    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+
+    # adding arrays in an array
+    for slices = 1:num_z_slices
+        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    CUDA.unsafe_free!.(x1)
+    CUDA.unsafe_free!.(x2)
+    CUDA.unsafe_free!.(y1)
+
+    return results
+end
+
+end
+
+ByVal.main()
diff --git a/perf/kernel.jl b/perf/kernel.jl
new file mode 100644
index 000000000..1022a774a
--- /dev/null
+++ b/perf/kernel.jl
@@ -0,0 +1,33 @@
+using CUDA: i32
+
+group = addgroup!(SUITE, "kernel")
+
+group["launch"] = @benchmarkable @cuda identity(nothing)
+
+group["occupancy"] = @benchmarkable begin
+    kernel = @cuda launch=false identity(nothing)
+    launch_configuration(kernel.fun)
+end
+
+src = CUDA.rand(Float32, 512, 1000)
+dest = similar(src)
+function indexing_kernel(dest, src)
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    @inbounds dest[i] = src[i]
+    return
+end
+group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+
+function checked_indexing_kernel(dest, src)
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    dest[i] = src[i]
+    return
+end
+group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+
+function rand_kernel(dest::AbstractArray{T}) where {T}
+    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    dest[i] = rand(T)
+    return
+end
+group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
new file mode 100644
index 000000000..9cd201353
--- /dev/null
+++ b/perf/latency.jl
@@ -0,0 +1,39 @@
+module Latency
+
+using CUDA
+using BenchmarkTools
+
+function main()
+    results = BenchmarkGroup()
+
+    base_cmd = Base.julia_cmd()
+    if Base.JLOptions().project != C_NULL
+        base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
+    end
+    # NOTE: we don't use Base.active_project() here because of how CI launches this script,
+    #       starting with --project in the main CUDA.jl project.
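+    #       (Base.JLOptions().project is the raw --project flag the parent process was
+    #       started with, which is what we forward to the subprocesses below)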
+ + # time to precompile the package and its dependencies + precompile_cmd = + `$base_cmd -e "pkg = Base.identify_package(\"CUDA\") + Base.compilecache(pkg)"` + results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60 + + # time to actually import the package + import_cmd = + `$base_cmd -e "using CUDA"` + results["import"] = @benchmark run($import_cmd) evals=1 seconds=30 + + # time to actually compile a kernel + ttfp_cmd = + `$base_cmd -e "using CUDA + kernel() = return + CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"` + results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60 + + results +end + +end + +Latency.main() diff --git a/perf/metal.jl b/perf/metal.jl new file mode 100644 index 000000000..5dbcbf0ab --- /dev/null +++ b/perf/metal.jl @@ -0,0 +1,14 @@ +group = addgroup!(SUITE, "cuda") + +let group = addgroup!(group, "synchronization") + let group = addgroup!(group, "stream") + group["blocking"] = @benchmarkable synchronize(blocking=true) + group["auto"] = @benchmarkable synchronize() + group["nonblocking"] = @benchmarkable synchronize(spin=false) + end + let group = addgroup!(group, "context") + group["blocking"] = @benchmarkable device_synchronize(blocking=true) + group["auto"] = @benchmarkable device_synchronize() + group["nonblocking"] = @benchmarkable device_synchronize(spin=false) + end +end diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl new file mode 100644 index 000000000..08348fab5 --- /dev/null +++ b/perf/metaldevrt.jl @@ -0,0 +1,42 @@ +module cudadevrt + +using CUDA, BenchmarkTools, Random + +const threads = 256 +#simple add matrix and vector kernel +function kernel_add_mat_vec(m, x1, x2, y) + # one block per column + offset = (blockIdx().x-1) * m + @inbounds xtmp = x2[blockIdx().x] + for i = threadIdx().x : blockDim().x : m + @inbounds y[offset + i] = x1[offset + i] + xtmp + end + return +end + +function add!(y, x1, x2) + m, n = size(x1) + @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) +end + +function main() + Random.seed!(1) + m, n = 3072, 1536 # 256 multiplier + x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) + x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) + y1 = similar(x1) + + results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2) + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + CUDA.unsafe_free!(x1) + CUDA.unsafe_free!(x2) + CUDA.unsafe_free!(y1) + + return results +end + +end + +cudadevrt.main() + diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl new file mode 100644 index 000000000..5969dcd9b --- /dev/null +++ b/perf/runbenchmarks.jl @@ -0,0 +1,128 @@ +# benchmark suite execution and codespeed submission + +using CUDA + +using BenchmarkTools + +using StableRNGs +rng = StableRNG(123) + +# we only submit results when running on the master branch +real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master" +if real_run + # to find untuned benchmarks + BenchmarkTools.DEFAULT_PARAMETERS.evals = 0 +end + +# convenience macro to create a benchmark that requires synchronizing the GPU +macro async_benchmarkable(ex...) + quote + @benchmarkable CUDA.@sync blocking=true $(ex...) + end +end + +# before anything else, run latency benchmarks. these spawn subprocesses, so we don't want +# to do so after regular benchmarks have caused the memory allocator to reserve memory. 
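+# (each latency benchmark times a fresh julia subprocess; see latency.jl for the three
+# phases measured: precompilation, package import, and first kernel compilation)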
+@info "Running latency benchmarks" +latency_results = include("latency.jl") + +SUITE = BenchmarkGroup() + +# NOTE: don't use spaces in benchmark names (tobami/codespeed#256) + +include("cuda.jl") +include("kernel.jl") +include("array.jl") + +if real_run + @info "Preparing main benchmarks" + warmup(SUITE; verbose=false) + tune!(SUITE) + + # reclaim memory that might have been used by the tuning process + GC.gc(true) + CUDA.reclaim() +end + +# benchmark groups that aren't part of the suite +addgroup!(SUITE, "integration") + +@info "Running main benchmarks" +results = run(SUITE, verbose=true) + +# integration tests (that do nasty things, so need to be run last) +@info "Running integration benchmarks" +integration_results = BenchmarkGroup() +integration_results["volumerhs"] = include("volumerhs.jl") +integration_results["byval"] = include("byval.jl") +integration_results["cudadevrt"] = include("cudadevrt.jl") + +results["latency"] = latency_results +results["integration"] = integration_results + +println(results) + + +## comparison + +# write out the results +BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results) + +# compare against previous results +# TODO: store these results so that we can compare when benchmarking PRs +reference_path = joinpath(@__DIR__, "reference.json") +if ispath(reference_path) + reference = BenchmarkTools.load(reference_path)[1] + comparison = judge(minimum(results), minimum(reference)) + + println("Improvements:") + println(improvements(comparison)) + + println("Regressions:") + println(regressions(comparison)) +end + + +## submission + +using JSON, HTTP + +if real_run + @info "Submitting to Codespeed..." + + basedata = Dict( + "branch" => ENV["CODESPEED_BRANCH"], + "commitid" => ENV["CODESPEED_COMMIT"], + "project" => ENV["CODESPEED_PROJECT"], + "environment" => ENV["CODESPEED_ENVIRONMENT"], + "executable" => ENV["CODESPEED_EXECUTABLE"] + ) + + # convert nested groups of benchmark to flat dictionaries of results + flat_results = [] + function flatten(results, prefix="") + for (key,value) in results + if value isa BenchmarkGroup + flatten(value, "$prefix$key/") + else + @assert value isa BenchmarkTools.Trial + + # codespeed reports maxima, but those are often very noisy. + # get rid of measurements that unnecessarily skew the distribution. + rmskew!(value) + + push!(flat_results, + Dict(basedata..., + "benchmark" => "$prefix$key", + "result_value" => median(value).time / 1e9, + "min" => minimum(value).time / 1e9, + "max" => maximum(value).time / 1e9)) + end + end + end + flatten(results) + + HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/", + ["Content-Type" => "application/x-www-form-urlencoded"], + HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results)))) +end diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl new file mode 100644 index 000000000..5c7737f57 --- /dev/null +++ b/perf/volumerhs.jl @@ -0,0 +1,273 @@ +module VolumeRHS + +using BenchmarkTools +using CUDA +using StableRNGs +using StaticArrays + +function loopinfo(name, expr, nodes...) 
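+    # pushes an Expr(:loopinfo, nodes...) into the loop body, which Julia lowers to
+    # LLVM !llvm.loop metadata; @unroll below uses this to request llvm.loop.unroll.full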
+ if expr.head != :for + error("Syntax error: pragma $name needs a for loop") + end + push!(expr.args[2].args, Expr(:loopinfo, nodes...)) + return expr +end + +macro unroll(expr) + expr = loopinfo("@unroll", expr, (Symbol("llvm.loop.unroll.full"),)) + return esc(expr) +end + +# HACK: module-local versions of core arithmetic; needed to get FMA +for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) + for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) + ir = """ + %x = f$f contract nsz $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) + end + end + end + @eval function $jlf(args...) + Base.$jlf(args...) + end +end + +let (jlf, f) = (:div_arcp, :div) + for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) + ir = """ + %x = f$f fast $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) + end + end + end + @eval function $jlf(args...) + Base.$jlf(args...) + end +end +rcp(x) = div_arcp(one(x), x) # still leads to rcp.rn which is also a function call + +# div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) +# rcp(x) = div_fast(one(x), x) + +# note the order of the fields below is also assumed in the code. +const _nstate = 5 +const _ρ, _U, _V, _W, _E = 1:_nstate +const stateid = (ρ = _ρ, U = _U, V = _V, W = _W, E = _E) + +const _nvgeo = 14 +const _ξx, _ηx, _ζx, _ξy, _ηy, _ζy, _ξz, _ηz, _ζz, _MJ, _MJI, +_x, _y, _z = 1:_nvgeo +const vgeoid = (ξx = _ξx, ηx = _ηx, ζx = _ζx, + ξy = _ξy, ηy = _ηy, ζy = _ζy, + ξz = _ξz, ηz = _ηz, ζz = _ζz, + MJ = _MJ, MJI = _MJI, + x = _x, y = _y, z = _z) + +const N = 4 +const nmoist = 0 +const ntrace = 0 + +Base.@irrational grav 9.81 BigFloat(9.81) +Base.@irrational gdm1 0.4 BigFloat(0.4) + +function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) + Q = Base.Experimental.Const(Q) + vgeo = Base.Experimental.Const(vgeo) + D = Base.Experimental.Const(D) + + nvar = _nstate + nmoist + ntrace + + Nq = N + 1 + + s_D = CuStaticSharedArray(eltype(D), (Nq, Nq)) + s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate)) + s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate)) + + r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsV = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) + r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) + + e = blockIdx().x + j = threadIdx().y + i = threadIdx().x + + @inbounds begin + for k in 1:Nq + r_rhsρ[k] = zero(eltype(rhs)) + r_rhsU[k] = zero(eltype(rhs)) + r_rhsV[k] = zero(eltype(rhs)) + r_rhsW[k] = zero(eltype(rhs)) + r_rhsE[k] = zero(eltype(rhs)) + end + + # fetch D into shared + s_D[i, j] = D[i, j] + @unroll for k in 1:Nq + sync_threads() + + # Load values will need into registers + MJ = vgeo[i, j, k, _MJ, e] + ξx, ξy, ξz = vgeo[i,j,k,_ξx,e], vgeo[i,j,k,_ξy,e], vgeo[i,j,k,_ξz,e] + ηx, ηy, ηz = vgeo[i,j,k,_ηx,e], vgeo[i,j,k,_ηy,e], vgeo[i,j,k,_ηz,e] + ζx, ζy, ζz = vgeo[i,j,k,_ζx,e], vgeo[i,j,k,_ζy,e], vgeo[i,j,k,_ζz,e] + z = vgeo[i,j,k,_z,e] + + U, V, W = Q[i, j, k, _U, e], Q[i, j, k, _V, e], Q[i, j, k, _W, e] + ρ, E = Q[i, j, k, _ρ, e], Q[i, j, k, _E, e] + + # GPU performance trick + # Allow optimizations to use the reciprocal of an argument rather than perform division. 
+ # IEEE floating-point division is implemented as a function call + ρinv = rcp(ρ) + ρ2inv = rcp(2ρ) + # ρ2inv = 0.5f0 * pinv + + P = gdm1*(E - (U^2 + V^2 + W^2)*ρ2inv - ρ*gravity*z) + + fluxρ_x = U + fluxU_x = ρinv * U * U + P + fluxV_x = ρinv * U * V + fluxW_x = ρinv * U * W + fluxE_x = ρinv * U * (E + P) + + fluxρ_y = V + fluxU_y = ρinv * V * U + fluxV_y = ρinv * V * V + P + fluxW_y = ρinv * V * W + fluxE_y = ρinv * V * (E + P) + + fluxρ_z = W + fluxU_z = ρinv * W * U + fluxV_z = ρinv * W * V + fluxW_z = ρinv * W * W + P + fluxE_z = ρinv * W * (E + P) + + s_F[i, j, _ρ] = MJ * (ξx * fluxρ_x + ξy * fluxρ_y + ξz * fluxρ_z) + s_F[i, j, _U] = MJ * (ξx * fluxU_x + ξy * fluxU_y + ξz * fluxU_z) + s_F[i, j, _V] = MJ * (ξx * fluxV_x + ξy * fluxV_y + ξz * fluxV_z) + s_F[i, j, _W] = MJ * (ξx * fluxW_x + ξy * fluxW_y + ξz * fluxW_z) + s_F[i, j, _E] = MJ * (ξx * fluxE_x + ξy * fluxE_y + ξz * fluxE_z) + + s_G[i, j, _ρ] = MJ * (ηx * fluxρ_x + ηy * fluxρ_y + ηz * fluxρ_z) + s_G[i, j, _U] = MJ * (ηx * fluxU_x + ηy * fluxU_y + ηz * fluxU_z) + s_G[i, j, _V] = MJ * (ηx * fluxV_x + ηy * fluxV_y + ηz * fluxV_z) + s_G[i, j, _W] = MJ * (ηx * fluxW_x + ηy * fluxW_y + ηz * fluxW_z) + s_G[i, j, _E] = MJ * (ηx * fluxE_x + ηy * fluxE_y + ηz * fluxE_z) + + r_Hρ = MJ * (ζx * fluxρ_x + ζy * fluxρ_y + ζz * fluxρ_z) + r_HU = MJ * (ζx * fluxU_x + ζy * fluxU_y + ζz * fluxU_z) + r_HV = MJ * (ζx * fluxV_x + ζy * fluxV_y + ζz * fluxV_z) + r_HW = MJ * (ζx * fluxW_x + ζy * fluxW_y + ζz * fluxW_z) + r_HE = MJ * (ζx * fluxE_x + ζy * fluxE_y + ζz * fluxE_z) + + # one shared access per 10 flops + for n = 1:Nq + Dkn = s_D[k, n] + + r_rhsρ[n] += Dkn * r_Hρ + r_rhsU[n] += Dkn * r_HU + r_rhsV[n] += Dkn * r_HV + r_rhsW[n] += Dkn * r_HW + r_rhsE[n] += Dkn * r_HE + end + + r_rhsW[k] -= MJ * ρ * gravity + + sync_threads() + + # loop of ξ-grid lines + @unroll for n = 1:Nq + Dni = s_D[n, i] + Dnj = s_D[n, j] + + r_rhsρ[k] += Dni * s_F[n, j, _ρ] + r_rhsρ[k] += Dnj * s_G[i, n, _ρ] + + r_rhsU[k] += Dni * s_F[n, j, _U] + r_rhsU[k] += Dnj * s_G[i, n, _U] + + r_rhsV[k] += Dni * s_F[n, j, _V] + r_rhsV[k] += Dnj * s_G[i, n, _V] + + r_rhsW[k] += Dni * s_F[n, j, _W] + r_rhsW[k] += Dnj * s_G[i, n, _W] + + r_rhsE[k] += Dni * s_F[n, j, _E] + r_rhsE[k] += Dnj * s_G[i, n, _E] + end + end # k + + @unroll for k in 1:Nq + MJI = vgeo[i, j, k, _MJI, e] + + # Updates are a performance bottleneck + # primary source of stall_long_sb + rhs[i, j, k, _U, e] += MJI*r_rhsU[k] + rhs[i, j, k, _V, e] += MJI*r_rhsV[k] + rhs[i, j, k, _W, e] += MJI*r_rhsW[k] + rhs[i, j, k, _ρ, e] += MJI*r_rhsρ[k] + rhs[i, j, k, _E, e] += MJI*r_rhsE[k] + end + end + return +end + +function main() + DFloat = Float32 + nelem = 240_000 + + rng = StableRNG(123) + + Nq = N + 1 + nvar = _nstate + nmoist + ntrace + + Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) + Q[:, :, :, _E, :] .+= 20 + vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) + + # make sure the entries of the mass matrix satisfy the inverse relation + vgeo[:, :, :, _MJ, :] .+= 3 + vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] + + D = CuArray(rand(rng, DFloat, Nq, Nq)) + + rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) + + threads=(N+1, N+1) + + kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) + # XXX: should we print these for all kernels? maybe upload them to Codespeed? + @info """volumerhs! 
details: + - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads + - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, + $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, + $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + results = @benchmark begin + CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; + threads=$threads, blocks=$nelem) + end + + # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them + CUDA.unsafe_free!(rhs) + CUDA.unsafe_free!(Q) + CUDA.unsafe_free!(vgeo) + CUDA.unsafe_free!(D) + + results +end + +end + +VolumeRHS.main() From d2f63c8b109ec91bfed8056748a6b4ebf471eed0 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 17 Sep 2024 09:50:07 -0300 Subject: [PATCH 2/8] Add CI --- .buildkite/pipeline.yml | 132 +++++++++++++++++++++++++++++++- .github/workflows/Benchmark.yml | 63 +++++++++++++++ perf/Project.toml | 1 + perf/runbenchmarks.jl | 7 +- 4 files changed, 197 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/Benchmark.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 3cc99e534..ccfdda4ff 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,7 +19,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only julia\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip julia\]/ timeout_in_minutes: 60 matrix: setup: @@ -46,7 +51,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 matrix: setup: @@ -75,7 +85,12 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 - label: "Opaque pointers" plugins: @@ -95,5 +110,114 @@ steps: queue: "juliaecosystem" os: "macos" arch: "aarch64" - if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only special\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip special\]/ timeout_in_minutes: 60 + + # we want to benchmark every commit on the master branch, even if it failed CI + - wait: ~ + # continue_on_failure: true + + - group: ":racehorse: Benchmarks" + steps: + - label: "Benchmarks" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia --project=perf -e ' + using Pkg + + println("--- :julia: Instantiating project") + Pkg.develop([PackageSpec(path=pwd())]) + Pkg.instantiate() + push!(LOAD_PATH, @__DIR__) + + println("+++ :julia: Benchmarking") + include("perf/runbenchmarks.jl")' + artifact_paths: + - "benchmarkresults.json" + agents: + queue: "juliaecosystem" + os: "macos" + arch: 
"aarch64" + if: | + build.message =~ /\[only benchmarks\]/ || + build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[skip benchmarks\]/ + timeout_in_minutes: 30 + # - label: "Benchmarks (dry run)" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" + # command: | + # julia --project -e ' + # using Pkg + + # println("--- :julia: Instantiating project") + # Pkg.resolve() + # Pkg.instantiate() + # Pkg.activate("perf") + # Pkg.resolve() + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Benchmarking") + # include("perf/runbenchmarks.jl")' + # artifact_paths: + # - "results.json" + # agents: + # queue: "juliaecosystem" + # os: "macos" + # arch: "aarch64" + # if: | + # build.message =~ /\[only benchmarks\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip benchmarks\]/ + # timeout_in_minutes: 30 + + # if we will submit results, use the benchmark queue so that we will + # be running on the same system each time + # - label: "Benchmarks on Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # env: + # BENCHMARKS: "true" + # CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" + # CODESPEED_BRANCH: "$BUILDKITE_BRANCH" + # CODESPEED_COMMIT: "$BUILDKITE_COMMIT" + # CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}" + # command: | + # julia --project -e ' + # using Pkg + # ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] + + # println("--- :julia: Instantiating project") + # Pkg.resolve() + # Pkg.instantiate() + # Pkg.activate("perf") + # Pkg.resolve() + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Benchmarking") + # include("perf/runbenchmarks.jl")' + # agents: + # queue: "benchmark" + # gpu: "rtx2070" + # cuda: "*" + # if: | + # build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ || + # build.branch =~ /^master$$/ && build.message !~ /\[only/ && + # build.message !~ /\[skip benchmarks\]/ + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # timeout_in_minutes: 30 diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml new file mode 100644 index 000000000..9c43a8417 --- /dev/null +++ b/.github/workflows/Benchmark.yml @@ -0,0 +1,63 @@ +name: Benchmarks +permissions: + contents: write # contents permission to update benchmark contents in gh-pages branch + statuses: read + deployments: write # deployments permission to deploy GitHub pages website + pull-requests: write + +on: + pull_request: + branches: + - main + paths: + - "src/**/*" + - "ext/**/*" + - "perf/**/*" + - ".buildkite/**/*" + - "Project.toml" + - ".github/workflows/Benchmark.yml" + push: + branches: + - main + paths: + - "src/**/*" + - "ext/**/*" + - "benchmarks/**/*" + - ".buildkite/**/*" + - "Project.toml" + - ".github/workflows/Benchmark.yml" + +jobs: + benchmark: + if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Download Buildkite Artifacts + id: download + uses: EnricoMi/download-buildkite-artifact-action@v1 + with: + buildkite_token: ${{ secrets.BUILDKITE_TOKEN }} + ignore_build_states: blocked,canceled,skipped,not_run,failed + ignore_job_states: timed_out,failed + output_path: artifacts + + - name: Locate Benchmarks Artifact + id: locate + if: ${{ steps.download.outputs.download-state == 'success' }} + run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null)" >> $GITHUB_OUTPUT + + 
- name: Upload Benchmark Results + if: ${{ steps.locate.outputs.path != '' }} + uses: benchmark-action/github-action-benchmark@v1 + with: + name: Metal Benchmarks + tool: "julia" + output-file-path: ${{ steps.locate.outputs.path }} + benchmark-data-dir-path: "" + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + summary-always: true + alert-threshold: "150%" + fail-on-alert: false + auto-push: ${{ github.event_name != 'pull_request' }} diff --git a/perf/Project.toml b/perf/Project.toml index 8314c7b28..decfbe75f 100644 --- a/perf/Project.toml +++ b/perf/Project.toml @@ -2,5 +2,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 5969dcd9b..e3c6c6e9f 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -14,6 +14,9 @@ if real_run BenchmarkTools.DEFAULT_PARAMETERS.evals = 0 end +# print system information +@info "System information:\n" * sprint(io->Metal.versioninfo(io)) + # convenience macro to create a benchmark that requires synchronizing the GPU macro async_benchmarkable(ex...) quote @@ -66,7 +69,7 @@ println(results) ## comparison # write out the results -BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results) +BenchmarkTools.save("benchmarkresults.json", results) # compare against previous results # TODO: store these results so that we can compare when benchmarking PRs @@ -85,7 +88,7 @@ end ## submission -using JSON, HTTP +# using JSON, HTTP if real_run @info "Submitting to Codespeed..." From 9175ab8254ec2e4f9e4b9e6616d14e661a734f94 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:26:56 -0300 Subject: [PATCH 3/8] Adapt for Metal [only benchmarks] --- perf/array.jl | 48 ++++++++++++------------ perf/byval.jl | 33 ++++++++--------- perf/kernel.jl | 36 +++++++++--------- perf/latency.jl | 12 +++--- perf/metal.jl | 14 ++----- perf/metaldevrt.jl | 26 ++++++------- perf/runbenchmarks.jl | 78 +++++++-------------------------------- perf/volumerhs.jl | 86 ++++++++++++++++++++++--------------------- 8 files changed, 139 insertions(+), 194 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index a6a226ccf..0c57a7dfa 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -5,16 +5,16 @@ const n = 1000 # generate some arrays cpu_mat = rand(rng, Float32, m, n) -gpu_mat = CuArray{Float32}(undef, size(cpu_mat)) +gpu_mat = MtlArray{Float32}(undef, size(cpu_mat)) gpu_vec = reshape(gpu_mat, length(gpu_mat)) gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) -gpu_mat_ints = CuArray(rand(rng, Int, m, n)) +gpu_mat_ints = MtlArray(rand(rng, Int, m, n)) gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) -gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) +gpu_mat_bools = MtlArray(rand(rng, Bool, m, n)) gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) -group["construct"] = @benchmarkable CuArray{Int}(undef, 1) +group["construct"] = @benchmarkable MtlArray{Int}(undef, 1) group["copy"] = @async_benchmarkable copy($gpu_mat) @@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!") end let group = addgroup!(group, "iteration") - group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] + group["scalar"] = 
@benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10] group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] @@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration") end end -let group = addgroup!(group, "reverse") - group["1d"] = @async_benchmarkable reverse($gpu_vec) - group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) - group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) - group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) -end +# let group = addgroup!(group, "reverse") +# group["1d"] = @async_benchmarkable reverse($gpu_vec) +# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) +# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) +# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) +# end group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 @@ -77,31 +77,31 @@ end let group = addgroup!(group, "random") let group = addgroup!(group, "rand") - group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) - group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) + group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n) + group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n) end let group = addgroup!(group, "rand!") - group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) - group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) + group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec) + group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints) end let group = addgroup!(group, "randn") - group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) - #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n) + group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n) + # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n) end let group = addgroup!(group, "randn!") - group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) - #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints) + group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec) + # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints) end end -let group = addgroup!(group, "sorting") - group["1d"] = @async_benchmarkable sort($gpu_vec) - group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) - group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) -end +# let group = addgroup!(group, "sorting") +# group["1d"] = @async_benchmarkable sort($gpu_vec) +# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) +# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +# end let group = addgroup!(group, "permutedims") group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) diff --git a/perf/byval.jl b/perf/byval.jl index d32d62a9c..6a5466418 100644 --- a/perf/byval.jl +++ b/perf/byval.jl @@ -1,13 +1,12 @@ module ByVal -using CUDA, BenchmarkTools, Random -using CUDA: i32 +using Metal, BenchmarkTools, Random const threads = 256 # simple add matrixes kernel function kernel_add_mat(n, x1, x2, y) - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = thread_position_in_grid_1d() if i <= n @inbounds y[i] = x1[i] + x2[i] end @@ -20,8 +19,8 @@ end # add arrays of matrixes kernel function kernel_add_mat_z_slices(n, vararg...) - x1, x2, y = get_inputs3(blockIdx().y, vararg...) - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...) 
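+    # (threadgroup_position_in_grid_2d().y takes over the role of CUDA's blockIdx().y:
+    # it selects which (x1, x2, y) slice this threadgroup adds)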
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -30,15 +29,15 @@ end
 
 function add_z_slices!(y, x1, x2)
     m1, n1 = size(x1[1]) #get size of first slice
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    # get length(x1) more blocks than needed to process 1 slice
-    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more groups than needed to process 1 slice
+    @metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
 end
 
 function add!(y, x1, x2)
     m1, n1 = size(x1)
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    @metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
 end
 
 function main()
@@ -54,22 +53,22 @@ function main()
     m, n = 3072, 1536 # 256 multiplier
     #m, n = 6007, 3001 # prime numbers to test memory access correctness
 
-    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
-    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
     y1 = [similar(x1[1]) for i = 1:num_z_slices]
 
     # reference bare-bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])
 
     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!.(x1)
-    CUDA.unsafe_free!.(x2)
-    CUDA.unsafe_free!.(y1)
+    Metal.unsafe_free!.(x1)
+    Metal.unsafe_free!.(x2)
+    Metal.unsafe_free!.(y1)
 
     return results
 end
diff --git a/perf/kernel.jl b/perf/kernel.jl
index 1022a774a..5cfcc9242 100644
--- a/perf/kernel.jl
+++ b/perf/kernel.jl
@@ -1,33 +1,35 @@
-using CUDA: i32
+# using GPUArrays
 
 group = addgroup!(SUITE, "kernel")
 
-group["launch"] = @benchmarkable @cuda identity(nothing)
+group["launch"] = @benchmarkable @metal identity(nothing)
 
-group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false identity(nothing)
-    launch_configuration(kernel.fun)
-end
+# group["occupancy"] = @benchmarkable begin
+#     kernel = @metal launch=false identity(nothing)
+#     GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
+#     return
+# end
 
-src = CUDA.rand(Float32, 512, 1000)
+src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)
 function indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
    @inbounds dest[i] = src[i]
     return
 end
-group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
 
 function checked_indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     dest[i] = src[i]
     return
 end
-group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
 
-function rand_kernel(dest::AbstractArray{T}) where {T}
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    dest[i] = rand(T)
-    return
-end
-group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
+## DELETE
+# function rand_kernel(dest::AbstractArray{T}) where {T}
+#     i = thread_position_in_grid_1d()
+#     dest[i] = Metal.rand(T)
+#     return
+# end
+# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
index 9cd201353..a1066abe8 100644
--- a/perf/latency.jl
+++ b/perf/latency.jl
@@ -1,6 +1,6 @@
 module Latency
 
-using CUDA
+using Metal
 using BenchmarkTools
 
 function main()
@@ -11,24 +11,24 @@ function main()
         base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
     end
     # NOTE: we don't use Base.active_project() here because of how CI launches this script,
-    #       starting with --project in the main CUDA.jl project.
+    #       starting with --project in the main Metal.jl project.
 
     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+        `$base_cmd -e "pkg = Base.identify_package(\"Metal\")
                        Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60
 
     # time to actually import the package
     import_cmd =
-        `$base_cmd -e "using CUDA"`
+        `$base_cmd -e "using Metal"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30
 
     # time to actually compile a kernel
     ttfp_cmd =
-        `$base_cmd -e "using CUDA
+        `$base_cmd -e "using Metal
                        kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
 
     results
diff --git a/perf/metal.jl b/perf/metal.jl
index 5dbcbf0ab..5e136b0ed 100644
--- a/perf/metal.jl
+++ b/perf/metal.jl
@@ -1,14 +1,6 @@
-group = addgroup!(SUITE, "cuda")
+group = addgroup!(SUITE, "metal")
 
 let group = addgroup!(group, "synchronization")
-    let group = addgroup!(group, "stream")
-        group["blocking"] = @benchmarkable synchronize(blocking=true)
-        group["auto"] = @benchmarkable synchronize()
-        group["nonblocking"] = @benchmarkable synchronize(spin=false)
-    end
-    let group = addgroup!(group, "context")
-        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
-        group["auto"] = @benchmarkable device_synchronize()
-        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
-    end
+    group["stream"] = @benchmarkable synchronize()
+    group["context"] = @benchmarkable device_synchronize()
 end
diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl
index 08348fab5..a3dbd07fc 100644
--- a/perf/metaldevrt.jl
+++ b/perf/metaldevrt.jl
@@ -1,14 +1,14 @@
-module cudadevrt
+module metaldevrt
 
-using CUDA, BenchmarkTools, Random
+using Metal, BenchmarkTools, Random
 
 const threads = 256
 #simple add matrix and vector kernel
 function kernel_add_mat_vec(m, x1, x2, y)
     # one block per column
-    offset = (blockIdx().x-1) * m
-    @inbounds xtmp = x2[blockIdx().x]
-    for i = threadIdx().x : blockDim().x : m
+    offset = (threadgroup_position_in_grid_2d().x-1) * m
+    @inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
+    for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m
         @inbounds y[offset + i] = x1[offset + i] + xtmp
     end
     return
 end
 
 function add!(y, x1, x2)
     m, n = size(x1)
-    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+    @metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
 end
 
 function main()
     Random.seed!(1)
     m, n = 3072, 1536 # 256 multiplier
-    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
-    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)
 
-    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
+    results = @benchmark Metal.@sync add!($y1, $x1, $x2)
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(x1)
-    CUDA.unsafe_free!(x2)
-    CUDA.unsafe_free!(y1)
+    Metal.unsafe_free!(x1)
+    Metal.unsafe_free!(x2)
+    Metal.unsafe_free!(y1)
 
     return results
 end
 
 end
 
-cudadevrt.main()
+metaldevrt.main()
 
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index e3c6c6e9f..549b01c9c 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -1,18 +1,14 @@
 # benchmark suite execution and codespeed submission
 
-using CUDA
+using Metal
 
 using BenchmarkTools
 
 using StableRNGs
 rng = StableRNG(123)
 
-# we only submit results when running on the master branch
-real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master"
-if real_run
-    # to find untuned benchmarks
-    BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
-end
+# to find untuned benchmarks
+BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
 
 # print system information
 @info "System information:\n" * sprint(io->Metal.versioninfo(io))
 
 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync blocking=true $(ex...)
+        @benchmarkable Metal.@sync $(ex...)
     end
 end
@@ -33,19 +29,18 @@ SUITE = BenchmarkGroup()
 
 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
 
-include("cuda.jl")
+include("metal.jl")
 include("kernel.jl")
 include("array.jl")
 
-if real_run
-    @info "Preparing main benchmarks"
-    warmup(SUITE; verbose=false)
-    tune!(SUITE)
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)
 
-    # reclaim memory that might have been used by the tuning process
-    GC.gc(true)
-    CUDA.reclaim()
-end
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+GC.gc(true)
+GC.gc(true)
 
 # benchmark groups that aren't part of the suite
 addgroup!(SUITE, "integration")
 
 @info "Running main benchmarks"
 results = run(SUITE, verbose=true)
 
 # integration tests (that do nasty things, so need to be run last)
 @info "Running integration benchmarks"
 integration_results = BenchmarkGroup()
-integration_results["volumerhs"] = include("volumerhs.jl")
+# integration_results["volumerhs"] = include("volumerhs.jl")
 integration_results["byval"] = include("byval.jl")
-integration_results["cudadevrt"] = include("cudadevrt.jl")
+integration_results["metaldevrt"] = include("metaldevrt.jl")
 
 results["latency"] = latency_results
 results["integration"] = integration_results
 
 println(results)
 
 
 ## comparison
 
 # write out the results
-BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results)
+BenchmarkTools.save("benchmarkresults.json", results)
 
 # compare against previous results
 # TODO: store these results so that we can compare when benchmarking PRs
 reference_path = joinpath(@__DIR__, "reference.json")
 if ispath(reference_path)
     reference = BenchmarkTools.load(reference_path)[1]
     comparison = judge(minimum(results), minimum(reference))
 
     println("Improvements:")
     println(improvements(comparison))
 
     println("Regressions:")
     println(regressions(comparison))
 end
-
-
-## submission
-
-# using JSON, HTTP
-
-if real_run
-    @info "Submitting to Codespeed..."
- - basedata = Dict( - "branch" => ENV["CODESPEED_BRANCH"], - "commitid" => ENV["CODESPEED_COMMIT"], - "project" => ENV["CODESPEED_PROJECT"], - "environment" => ENV["CODESPEED_ENVIRONMENT"], - "executable" => ENV["CODESPEED_EXECUTABLE"] - ) - - # convert nested groups of benchmark to flat dictionaries of results - flat_results = [] - function flatten(results, prefix="") - for (key,value) in results - if value isa BenchmarkGroup - flatten(value, "$prefix$key/") - else - @assert value isa BenchmarkTools.Trial - - # codespeed reports maxima, but those are often very noisy. - # get rid of measurements that unnecessarily skew the distribution. - rmskew!(value) - - push!(flat_results, - Dict(basedata..., - "benchmark" => "$prefix$key", - "result_value" => median(value).time / 1e9, - "min" => minimum(value).time / 1e9, - "max" => maximum(value).time / 1e9)) - end - end - end - flatten(results) - - HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/", - ["Content-Type" => "application/x-www-form-urlencoded"], - HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results)))) -end diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl index 5c7737f57..f5b3b6d35 100644 --- a/perf/volumerhs.jl +++ b/perf/volumerhs.jl @@ -1,7 +1,7 @@ module VolumeRHS using BenchmarkTools -using CUDA +using Metal using StableRNGs using StaticArrays @@ -20,16 +20,16 @@ end # HACK: module-local versions of core arithmetic; needed to get FMA for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) - for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) - ir = """ - %x = f$f contract nsz $llvmT %0, %1 - ret $llvmT %x - """ - @eval begin - # the @pure is necessary so that we can constant propagate. - @inline Base.@pure function $jlf(a::$T, b::$T) - Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) - end + T = :Float32 + llvmT = "float" + ir = """ + %x = f$f contract nsz $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) end end @eval function $jlf(args...) @@ -38,16 +38,16 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub)) end let (jlf, f) = (:div_arcp, :div) - for (T, llvmT) in ((:Float32, "float"), (:Float64, "double")) - ir = """ - %x = f$f fast $llvmT %0, %1 - ret $llvmT %x - """ - @eval begin - # the @pure is necessary so that we can constant propagate. - @inline Base.@pure function $jlf(a::$T, b::$T) - Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) - end + T = :Float32 + llvmT = "float" + ir = """ + %x = f$f fast $llvmT %0, %1 + ret $llvmT %x + """ + @eval begin + # the @pure is necessary so that we can constant propagate. + @inline Base.@pure function $jlf(a::$T, b::$T) + Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) end end @eval function $jlf(args...) 
@@ -77,8 +77,10 @@ const N = 4 const nmoist = 0 const ntrace = 0 -Base.@irrational grav 9.81 BigFloat(9.81) -Base.@irrational gdm1 0.4 BigFloat(0.4) +# Base.@irrational grav 9.81 BigFloat(9.81) +const grav = 9.81f0 +# Base.@irrational gdm1 0.4 BigFloat(0.4) +const gdm1 = 0.4f0 function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) Q = Base.Experimental.Const(Q) @@ -99,9 +101,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem) r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef) r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef) - e = blockIdx().x - j = threadIdx().y - i = threadIdx().x + e = threadgroup_position_in_grid_2d().x + j = thread_position_in_threadgroup_2d().y + i = thread_position_in_threadgroup_2d().x @inbounds begin for k in 1:Nq @@ -233,37 +235,37 @@ function main() Nq = N + 1 nvar = _nstate + nmoist + ntrace - Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) + Q = 1 .+ MtlArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem)) Q[:, :, :, _E, :] .+= 20 - vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) + vgeo = MtlArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem)) # make sure the entries of the mass matrix satisfy the inverse relation vgeo[:, :, :, _MJ, :] .+= 3 vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :] - D = CuArray(rand(rng, DFloat, Nq, Nq)) + D = MtlArray(rand(rng, DFloat, Nq, Nq)) - rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) + rhs = MtlArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem)) threads=(N+1, N+1) - kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) + kernel = @metal launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem) # XXX: should we print these for all kernels? maybe upload them to Codespeed? - @info """volumerhs! details: - - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads - - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory, - $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory, - $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory""" + # @info """volumerhs! 
details: + # - $(Metal.registers(kernel)) registers, max $(Metal.maxthreads(kernel)) threads + # - $(Base.format_bytes(Metal.memory(kernel).local)) local memory, + # $(Base.format_bytes(Metal.memory(kernel).shared)) shared memory, + # $(Base.format_bytes(Metal.memory(kernel).constant)) constant memory""" results = @benchmark begin - CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; - threads=$threads, blocks=$nelem) + Metal.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem; + threads=$threads, groups=$nelem) end # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them - CUDA.unsafe_free!(rhs) - CUDA.unsafe_free!(Q) - CUDA.unsafe_free!(vgeo) - CUDA.unsafe_free!(D) + Metal.unsafe_free!(rhs) + Metal.unsafe_free!(Q) + Metal.unsafe_free!(vgeo) + Metal.unsafe_free!(D) results end From cc8de561c65d96a9a617fbc4cbf15c82a1f81538 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:08:49 -0300 Subject: [PATCH 4/8] Save median of results [only benchmarks] --- perf/runbenchmarks.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 549b01c9c..49078a643 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -64,7 +64,7 @@ println(results) ## comparison # write out the results -BenchmarkTools.save("benchmarkresults.json", results) +BenchmarkTools.save("benchmarkresults.json", median(results)) # compare against previous results # TODO: store these results so that we can compare when benchmarking PRs From a448039110778aab9107fd817560648d78386d70 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:32:24 -0300 Subject: [PATCH 5/8] Cleanup --- .buildkite/pipeline.yml | 71 ----------------------------------------- 1 file changed, 71 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ccfdda4ff..9dbcda8f8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -150,74 +150,3 @@ steps: build.message !~ /\[only/ && !build.pull_request.draft && build.message !~ /\[skip benchmarks\]/ timeout_in_minutes: 30 - # - label: "Benchmarks (dry run)" - # plugins: - # - JuliaCI/julia#v1: - # version: "1.10" - # command: | - # julia --project -e ' - # using Pkg - - # println("--- :julia: Instantiating project") - # Pkg.resolve() - # Pkg.instantiate() - # Pkg.activate("perf") - # Pkg.resolve() - # Pkg.instantiate() - # push!(LOAD_PATH, @__DIR__) - - # println("+++ :julia: Benchmarking") - # include("perf/runbenchmarks.jl")' - # artifact_paths: - # - "results.json" - # agents: - # queue: "juliaecosystem" - # os: "macos" - # arch: "aarch64" - # if: | - # build.message =~ /\[only benchmarks\]/ || - # build.message !~ /\[only/ && !build.pull_request.draft && - # build.message !~ /\[skip benchmarks\]/ - # timeout_in_minutes: 30 - - # if we will submit results, use the benchmark queue so that we will - # be running on the same system each time - # - label: "Benchmarks on Julia {{matrix.julia}}" - # plugins: - # - JuliaCI/julia#v1: - # version: "{{matrix.julia}}" - # env: - # BENCHMARKS: "true" - # CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME" - # CODESPEED_BRANCH: "$BUILDKITE_BRANCH" - # CODESPEED_COMMIT: "$BUILDKITE_COMMIT" - # CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}" - # command: | - # julia --project -e ' - # using Pkg - # ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] 
- - # println("--- :julia: Instantiating project") - # Pkg.resolve() - # Pkg.instantiate() - # Pkg.activate("perf") - # Pkg.resolve() - # Pkg.instantiate() - # push!(LOAD_PATH, @__DIR__) - - # println("+++ :julia: Benchmarking") - # include("perf/runbenchmarks.jl")' - # agents: - # queue: "benchmark" - # gpu: "rtx2070" - # cuda: "*" - # if: | - # build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ || - # build.branch =~ /^master$$/ && build.message !~ /\[only/ && - # build.message !~ /\[skip benchmarks\]/ - # matrix: - # setup: - # julia: - # - "1.10" - # - "1.11" - # timeout_in_minutes: 30 From 57f004f2d6f6b5b5d191f8158aa06575627ba75e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:41:19 -0300 Subject: [PATCH 6/8] Don't disable main tests when a PR is a draft --- .buildkite/pipeline.yml | 2 +- .github/workflows/Benchmark.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 9dbcda8f8..45471a42b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -22,7 +22,7 @@ steps: if: | build.message =~ /\[only tests\]/ || build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && + build.message !~ /\[only/ && build.message !~ /\[skip tests\]/ && build.message !~ /\[skip julia\]/ timeout_in_minutes: 60 diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml index 9c43a8417..1904c1640 100644 --- a/.github/workflows/Benchmark.yml +++ b/.github/workflows/Benchmark.yml @@ -29,7 +29,7 @@ on: jobs: benchmark: - if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }} + if: ${{ contains(github.event.head_commit.message, '[only benchmarks]') || !contains(github.event.head_commit.message, '[only') && !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.draft == false }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 From c3a6fb37f267ed3a81218c65ffe32e68e32586fb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 24 Sep 2024 01:13:47 -0300 Subject: [PATCH 7/8] Benchmark on macOS 15 to catch potential performance impacts of new features. --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 45471a42b..fef716300 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -142,9 +142,10 @@ steps: artifact_paths: - "benchmarkresults.json" agents: - queue: "juliaecosystem" + queue: "julia" os: "macos" arch: "aarch64" + macos_version: "15.0" if: | build.message =~ /\[only benchmarks\]/ || build.message !~ /\[only/ && !build.pull_request.draft && From cfd94990af685a94f90514a1592b5e7226edc47e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 24 Sep 2024 08:26:23 -0300 Subject: [PATCH 8/8] Use juliaecosystem runner for macOS 15 --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index fef716300..b35a38233 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -142,7 +142,7 @@ steps: artifact_paths: - "benchmarkresults.json" agents: - queue: "julia" + queue: "juliaecosystem" os: "macos" arch: "aarch64" macos_version: "15.0"