From d65b63da10dcca8a0f3e7a5c9d9568829644542a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 20 Oct 2025 19:03:42 +0200 Subject: [PATCH 01/22] Mark all the places we need to support NewPM --- src/compiler.jl | 20 ++++++++++++++------ src/compiler/optimize.jl | 28 ++++++++++++++++++---------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/compiler.jl b/src/compiler.jl index 7bcee309a1..35a924f42a 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -2752,9 +2752,13 @@ function enzyme!( for f in collect(functions(mod)) API.EnzymeFixupBatchedJuliaCallingConvention(f) end - ModulePassManager() do pm - dce!(pm) - LLVM.run!(pm, mod) + if LLVM.has_oldpm() + ModulePassManager() do pm + dce!(pm) + LLVM.run!(pm, mod) + end + else + # TODO(NewPM) end fix_decayaddr!(mod) adjointf = adjointf == nothing ? nothing : functions(mod)[adjointfname] @@ -5164,9 +5168,13 @@ end push!(toremove, name(f)) end end - ModulePassManager() do pm - always_inliner!(pm) - LLVM.run!(pm, mod) + if LLVM.has_oldpm() + ModulePassManager() do pm + always_inliner!(pm) + LLVM.run!(pm, mod) + end + else + # TODO(NewPM) end for fname in toremove if haskey(functions(mod), fname) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index bf73b9f955..7c9e51fae5 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -450,6 +450,10 @@ cse!(pm) = LLVM.API.LLVMAddEarlyCSEPass(pm) function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) addr13NoAlias(mod) + if !LLVM.has_oldpm() + # TODO(NewPM) + return + end # everying except unroll, slpvec, loop-vec # then finish Julia GC ModulePassManager() do pm @@ -750,19 +754,23 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool ), ) end - LLVM.ModulePassManager() do pm - addTargetPasses!(pm, tm, LLVM.triple(mod)) - addOptimizationPasses!(pm, tm) - LLVM.run!(pm, mod) - end - if machine - # TODO enable validate_return_roots - # validate_return_roots!(mod) + if LLVM.has_oldpm() 
LLVM.ModulePassManager() do pm - addJuliaLegalizationPasses!(pm, tm, true) - addMachinePasses!(pm, tm) + addTargetPasses!(pm, tm, LLVM.triple(mod)) + addOptimizationPasses!(pm, tm) LLVM.run!(pm, mod) end + if machine + # TODO enable validate_return_roots + # validate_return_roots!(mod) + LLVM.ModulePassManager() do pm + addJuliaLegalizationPasses!(pm, tm, true) + addMachinePasses!(pm, tm) + LLVM.run!(pm, mod) + end + end + else + # TODO(NewPM) end for f in functions(mod) if isempty(blocks(f)) From c072150588ba49e67c6db90143baf7635d4f32d9 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 20 Oct 2025 21:05:14 +0200 Subject: [PATCH 02/22] fix two oldpm invocations --- src/compiler.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/compiler.jl b/src/compiler.jl index 35a924f42a..26c37d6f6e 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -2758,7 +2758,7 @@ function enzyme!( LLVM.run!(pm, mod) end else - # TODO(NewPM) + run!(DCEPass(), mod) end fix_decayaddr!(mod) adjointf = adjointf == nothing ? 
nothing : functions(mod)[adjointfname] @@ -5170,11 +5170,11 @@ end end if LLVM.has_oldpm() ModulePassManager() do pm - always_inliner!(pm) - LLVM.run!(pm, mod) + always_inliner!(pm) + LLVM.run!(pm, mod) end else - # TODO(NewPM) + run!(AlwaysInlinerPass(), mod) end for fname in toremove if haskey(functions(mod), fname) From a88441b8af854511c5dd64f26314513f71fd372e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 22 Oct 2025 18:25:13 +0200 Subject: [PATCH 03/22] WIP: translate finalization to NewPM --- src/compiler/optimize.jl | 67 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 7c9e51fae5..1c3b272bdd 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -687,6 +687,17 @@ function addMachinePasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) gvn!(pm) end +function addMachinePasses_newPM!(mpm::LLVM.NewPMPassManager) + add!(mpm, NewPMFunctionPassManager()) do fpm + if VERSION < v"1.12.0-DEV.1390" + add!(fpm, CombineMulAddPass()) + end + add!(fpm, DivRemPairsPass()) + add!(fpm, DemoteFloat16Pass()) + add!(fpm, GVNPass()) + end +end + function addJuliaLegalizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, lower_intrinsics::Bool = true) if lower_intrinsics # LowerPTLS removes an indirect call. As a result, it is likely to trigger @@ -722,6 +733,46 @@ function addJuliaLegalizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.Target end end +ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) + +function addJuliaLegalizationPasses_newPM!(mpm::LLVM.NewPMPassManager, lower_intrinsics::Bool = true) + if lower_intrinsics + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, ReinsertGCMarkerPass()) + if VERSION < v"1.13.0-DEV.36" + add!(fpm, LowerExcHandlersPass()) + end + # TODO: strong=false? 
+ add!(fpm, GCInvariantVerifierPass()) + end + add!(mpm, VerifierPass()) + add!(mpm, RemoveNIPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, LateLowerGCPass()) + if VERSION >= v"1.11.0-DEV.208" + add!(fpm, FinalLowerGCPass()) + end + end + if VERSION < v"1.11.0-DEV.208" + add!(mpm, FinalLowerGCPass()) + end + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) + add!(fpm, SCCPPass()) + add!(fpm, DCEPass()) + end + add!(mpm, LowerPTLSPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstructionCombiningPass()) + # TODO: from libEnzyme + # add!(fpm, JLInstSimplifyPass()) + add!(fpm, SimplifyCFGPass(; aggressiveSimplifyCFGOptions...)) + end + else + add!(mpm, RemoveNIPass()) + end +end + function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool = true) addr13NoAlias(mod) removeDeadArgs!(mod, tm) @@ -770,7 +821,21 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool end end else - # TODO(NewPM) + @dispose pb = NewPMPassBuilder() begin + register!(pb, ReinsertGCMarkerPass()) + add!(pb, NewPMModulePassManager()) do mpm + # TODO(NewPM) + # addTargetPasses!(mpm, tm, LLVM.triple(mod)) + # addOptimizationPasses!(mpm, tm) + end + if machine + add!(pb, NewPMModulePassManager()) do mpm + addJuliaLegalizationPasses_newPM!(mpm, true) + addMachinePasses_newPM!(mpm) + end + end + run!(pb, mod, tm) + end end for f in functions(mod) if isempty(blocks(f)) From 2ffa6edd8f5cf1ca1e89ced49d37317725a5cd73 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 22 Oct 2025 20:28:28 +0200 Subject: [PATCH 04/22] Minimally working! 
--- src/compiler/optimize.jl | 7 ++++- src/llvm/transforms.jl | 65 +++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 1c3b272bdd..ce438ef822 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -763,9 +763,14 @@ function addJuliaLegalizationPasses_newPM!(mpm::LLVM.NewPMPassManager, lower_int end add!(mpm, LowerPTLSPass()) add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, InstructionCombiningPass()) + add!(fpm, InstCombinePass()) # TODO: from libEnzyme # add!(fpm, JLInstSimplifyPass()) + aggressiveSimplifyCFGOptions = + (forward_switch_cond=true, + switch_range_to_icmp=true, + switch_to_lookup=true, + hoist_common_insts=true) add!(fpm, SimplifyCFGPass(; aggressiveSimplifyCFGOptions...)) end else diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index a190c41aa9..25785bac25 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2401,10 +2401,15 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) # and including 12 (but fixed 13+), Attributor will incorrectly change functions that # call code with undef to become unreachable, even when there exist other valid # callsites. See: https://godbolt.org/z/9Y3Gv6q5M - ModulePassManager() do pm - global_dce!(pm) - LLVM.run!(pm, mod) + if LLVM.has_oldpm() + ModulePassManager() do pm + global_dce!(pm) + LLVM.run!(pm, mod) + end + else + run!(GlobalDCEPass(), mod) end + # Prevent dead-arg-elimination of functions which we may require args for in the derivative funcT = LLVM.FunctionType(LLVM.VoidType(), LLVMType[], vararg = true) if LLVM.version().major <= 15 @@ -2560,37 +2565,49 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end propagate_returned!(mod) - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? 
- cse!(pm) - LLVM.run!(pm, mod) + if LLVM.has_oldpm() + ModulePassManager() do pm + instruction_combining!(pm) + jl_inst_simplify!(pm) + alloc_opt_tm!(pm, tm) + scalar_repl_aggregates_ssa!(pm) # SSA variant? + cse!(pm) + LLVM.run!(pm, mod) + end + else + # TODO(NewPM) end propagate_returned!(mod) pre_attr!(mod, RunAttributor[]) if RunAttributor[] - if LLVM.version().major >= 13 - ModulePassManager() do pm - API.EnzymeAddAttributorLegacyPass(pm) - LLVM.run!(pm, mod) + if LLVM.has_oldpm() + if LLVM.version().major >= 13 + ModulePassManager() do pm + API.EnzymeAddAttributorLegacyPass(pm) + LLVM.run!(pm, mod) + end end + else + # TODO(NewPM) end end propagate_returned!(mod) - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - if RunAttributor[] - if LLVM.version().major >= 13 - API.EnzymeAddAttributorLegacyPass(pm) + if LLVM.has_oldpm() + ModulePassManager() do pm + instruction_combining!(pm) + jl_inst_simplify!(pm) + alloc_opt_tm!(pm, tm) + scalar_repl_aggregates_ssa!(pm) # SSA variant? 
+ if RunAttributor[] + if LLVM.version().major >= 13 + API.EnzymeAddAttributorLegacyPass(pm) + end end + cse!(pm) + LLVM.run!(pm, mod) end - cse!(pm) - LLVM.run!(pm, mod) + else + # TODO(NewPM) end post_attr!(mod, RunAttributor[]) propagate_returned!(mod) From 638fb2c26bf91fb2ec5578f5c90332f2317b843d Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 22 Oct 2025 20:59:15 +0200 Subject: [PATCH 05/22] handle pass from libEnzyme --- src/compiler/optimize.jl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index ce438ef822..cea5aa9d16 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -1,3 +1,10 @@ +function registerEnzymeAndPassPipeline!(pb::NewPMPassBuilder) + enzyme_callback = cglobal((:registerEnzymeAndPassPipeline, API.libEnzyme)) + LLVM.API.LLVMPassBuilderExtensionsPushRegistrationCallbacks(pb.exts, enzyme_callback) +end + +LLVM.@function_pass "jl-inst-simplify" JLInstSimplifyPass + struct PipelineConfig Speedup::Cint Size::Cint @@ -764,8 +771,7 @@ function addJuliaLegalizationPasses_newPM!(mpm::LLVM.NewPMPassManager, lower_int add!(mpm, LowerPTLSPass()) add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, InstCombinePass()) - # TODO: from libEnzyme - # add!(fpm, JLInstSimplifyPass()) + add!(fpm, JLInstSimplifyPass()) aggressiveSimplifyCFGOptions = (forward_switch_cond=true, switch_range_to_icmp=true, @@ -827,6 +833,7 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool end else @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) register!(pb, ReinsertGCMarkerPass()) add!(pb, NewPMModulePassManager()) do mpm # TODO(NewPM) From b6207e2233ecc91a291a27b32eda3269f8b35356 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 16:15:38 +0200 Subject: [PATCH 06/22] add attributor pass --- src/llvm/transforms.jl | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 
deletions(-) diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index 25785bac25..46f4fb742c 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2575,7 +2575,19 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) LLVM.run!(pm, mod) end else - # TODO(NewPM) + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) + end + end + LLVM.run!(pb, mod) + end end propagate_returned!(mod) pre_attr!(mod, RunAttributor[]) @@ -2588,7 +2600,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end else - # TODO(NewPM) + LLVM.run!(AttributorPass(), mod) end end propagate_returned!(mod) @@ -2607,7 +2619,22 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) LLVM.run!(pm, mod) end else - # TODO(NewPM) + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + end + add!(mpm, AttributorPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, EarlyCSEPass()) + end + end + LLVM.run!(pb, mod) + end end post_attr!(mod, RunAttributor[]) propagate_returned!(mod) From 4eec6b28274be861beca157e0ff686fd3469382c Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 16:32:08 +0200 Subject: [PATCH 07/22] add expose options --- src/compiler/optimize.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index cea5aa9d16..5307ab676b 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -76,6 +76,15 @@ function 
run_jl_pipeline(pm::ModulePassManager, tm::LLVM.TargetMachine; kwargs.. add!(pm, ModulePass("JLPipeline", jl_pipeline)) end +function julia_pipeline(pb, mpm; kwargs...) + config = Ref(pipeline_options(; kwargs...)) + @ccall jl_build_newpm_pipeline( + mpm.ref::Ptr{Cvoid}, + pb.ref::Ptr{Cvoid}, + config::Ptr{PipelineConfig}, + )::Cvoid +end + @static if VERSION < v"1.11.0-DEV.428" else barrier_noop!(pm) = nothing From c66567b3fc6e824aa2e584fb67eceff4cae48456 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 16:38:56 +0200 Subject: [PATCH 08/22] use newpm as default --- src/compiler.jl | 4 ++-- src/compiler/optimize.jl | 1 + src/llvm/transforms.jl | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/compiler.jl b/src/compiler.jl index 26c37d6f6e..33460ec486 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -2752,7 +2752,7 @@ function enzyme!( for f in collect(functions(mod)) API.EnzymeFixupBatchedJuliaCallingConvention(f) end - if LLVM.has_oldpm() + if !LLVM.has_newpm() ModulePassManager() do pm dce!(pm) LLVM.run!(pm, mod) @@ -5168,7 +5168,7 @@ end push!(toremove, name(f)) end end - if LLVM.has_oldpm() + if !LLVM.has_newpm() ModulePassManager() do pm always_inliner!(pm) LLVM.run!(pm, mod) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 5307ab676b..bca6a30d63 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -825,6 +825,7 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool ), ) end + # TODO(NewPM): Swap conditionals when the pipeline is ready if LLVM.has_oldpm() LLVM.ModulePassManager() do pm addTargetPasses!(pm, tm, LLVM.triple(mod)) diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index 46f4fb742c..c5eedb0699 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2401,7 +2401,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) # and including 12 (but fixed 13+), Attributor will incorrectly change 
functions that # call code with undef to become unreachable, even when there exist other valid # callsites. See: https://godbolt.org/z/9Y3Gv6q5M - if LLVM.has_oldpm() + if !LLVM.has_newpm() ModulePassManager() do pm global_dce!(pm) LLVM.run!(pm, mod) @@ -2565,7 +2565,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end propagate_returned!(mod) - if LLVM.has_oldpm() + if !LLVM.has_newpm() ModulePassManager() do pm instruction_combining!(pm) jl_inst_simplify!(pm) @@ -2592,7 +2592,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) propagate_returned!(mod) pre_attr!(mod, RunAttributor[]) if RunAttributor[] - if LLVM.has_oldpm() + if !LLVM.has_newpm() if LLVM.version().major >= 13 ModulePassManager() do pm API.EnzymeAddAttributorLegacyPass(pm) @@ -2604,7 +2604,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end propagate_returned!(mod) - if LLVM.has_oldpm() + if !LLVM.has_newpm() ModulePassManager() do pm instruction_combining!(pm) jl_inst_simplify!(pm) From c83779d7c7370735d866d9d9da98660dc06acf52 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 19:12:54 +0200 Subject: [PATCH 09/22] use Enzyme variant of AttributorPass --- src/compiler/optimize.jl | 13 +++++++++++++ src/llvm/transforms.jl | 11 +++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index bca6a30d63..cf4c991443 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -5,6 +5,19 @@ end LLVM.@function_pass "jl-inst-simplify" JLInstSimplifyPass +function enzyme_attributor_pass!(mod::LLVM.Module) + ccall( + (:LLVMRunEnzymeAttributorPass, API.libEnzyme), + Cvoid, + (LLVM.API.LLVMModuleRef,), + mod, + ) + return true +end + +EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_pass!) 
+ + struct PipelineConfig Speedup::Cint Size::Cint diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index c5eedb0699..ad973e2494 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2600,7 +2600,13 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end else - LLVM.run!(AttributorPass(), mod) + LLVM.@dispose pb = NewPMPassBuilder() begin + register!(pb, EnzymeAttributorPass()) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, EnzymeAttributorPass()) + end + LLVM.run!(pb, mod) + end end end propagate_returned!(mod) @@ -2621,6 +2627,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) else LLVM.@dispose pb = NewPMPassBuilder() begin registerEnzymeAndPassPipeline!(pb) + register!(pb, EnzymeAttributorPass()) add!(pb, NewPMModulePassManager()) do mpm add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, InstCombinePass()) @@ -2628,7 +2635,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) add!(fpm, AllocOptPass()) add!(fpm, SROAPass()) end - add!(mpm, AttributorPass()) + add!(mpm, EnzymeAttributorPass()) add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, EarlyCSEPass()) end From 4c1d9d8a2592a625fedc8ad8cf860d60c74e7339 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 19:29:44 +0200 Subject: [PATCH 10/22] fixup! 
use Enzyme variant of AttributorPass --- src/compiler/optimize.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index cf4c991443..ae6735df10 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -7,7 +7,7 @@ LLVM.@function_pass "jl-inst-simplify" JLInstSimplifyPass function enzyme_attributor_pass!(mod::LLVM.Module) ccall( - (:LLVMRunEnzymeAttributorPass, API.libEnzyme), + (:RunAttributorOnModule, API.libEnzyme), Cvoid, (LLVM.API.LLVMModuleRef,), mod, From fce14c0dc93062d426356821ae8523e3c4aa1d59 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 19:45:32 +0200 Subject: [PATCH 11/22] all version of LLVM support NewPM --- src/compiler.jl | 18 +------- src/llvm/transforms.jl | 101 ++++++++++++----------------------------- 2 files changed, 31 insertions(+), 88 deletions(-) diff --git a/src/compiler.jl b/src/compiler.jl index 33460ec486..e6ea66c331 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -2752,14 +2752,7 @@ function enzyme!( for f in collect(functions(mod)) API.EnzymeFixupBatchedJuliaCallingConvention(f) end - if !LLVM.has_newpm() - ModulePassManager() do pm - dce!(pm) - LLVM.run!(pm, mod) - end - else - run!(DCEPass(), mod) - end + run!(DCEPass(), mod) fix_decayaddr!(mod) adjointf = adjointf == nothing ? 
nothing : functions(mod)[adjointfname] augmented_primalf = @@ -5168,14 +5161,7 @@ end push!(toremove, name(f)) end end - if !LLVM.has_newpm() - ModulePassManager() do pm - always_inliner!(pm) - LLVM.run!(pm, mod) - end - else - run!(AlwaysInlinerPass(), mod) - end + run!(AlwaysInlinerPass(), mod) for fname in toremove if haskey(functions(mod), fname) f = functions(mod)[fname] diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index ad973e2494..4ab3344b95 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2401,14 +2401,7 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) # and including 12 (but fixed 13+), Attributor will incorrectly change functions that # call code with undef to become unreachable, even when there exist other valid # callsites. See: https://godbolt.org/z/9Y3Gv6q5M - if !LLVM.has_newpm() - ModulePassManager() do pm - global_dce!(pm) - LLVM.run!(pm, mod) - end - else - run!(GlobalDCEPass(), mod) - end + run!(GlobalDCEPass(), mod) # Prevent dead-arg-elimination of functions which we may require args for in the derivative funcT = LLVM.FunctionType(LLVM.VoidType(), LLVMType[], vararg = true) @@ -2565,84 +2558,48 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end propagate_returned!(mod) - if !LLVM.has_newpm() - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? 
- cse!(pm) - LLVM.run!(pm, mod) - end - else - LLVM.@dispose pb = NewPMPassBuilder() begin - registerEnzymeAndPassPipeline!(pb) - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, InstCombinePass()) - add!(fpm, JLInstSimplifyPass()) - add!(fpm, AllocOptPass()) - add!(fpm, SROAPass()) - add!(fpm, EarlyCSEPass()) - end + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) end - LLVM.run!(pb, mod) end + LLVM.run!(pb, mod) end propagate_returned!(mod) pre_attr!(mod, RunAttributor[]) if RunAttributor[] - if !LLVM.has_newpm() - if LLVM.version().major >= 13 - ModulePassManager() do pm - API.EnzymeAddAttributorLegacyPass(pm) - LLVM.run!(pm, mod) - end - end - else - LLVM.@dispose pb = NewPMPassBuilder() begin - register!(pb, EnzymeAttributorPass()) - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, EnzymeAttributorPass()) - end - LLVM.run!(pb, mod) - end - end - end - propagate_returned!(mod) - if !LLVM.has_newpm() - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? 
- if RunAttributor[] - if LLVM.version().major >= 13 - API.EnzymeAddAttributorLegacyPass(pm) - end - end - cse!(pm) - LLVM.run!(pm, mod) - end - else LLVM.@dispose pb = NewPMPassBuilder() begin - registerEnzymeAndPassPipeline!(pb) register!(pb, EnzymeAttributorPass()) add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, InstCombinePass()) - add!(fpm, JLInstSimplifyPass()) - add!(fpm, AllocOptPass()) - add!(fpm, SROAPass()) - end add!(mpm, EnzymeAttributorPass()) - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, EarlyCSEPass()) - end end LLVM.run!(pb, mod) end end + propagate_returned!(mod) + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, EnzymeAttributorPass()) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + end + add!(mpm, EnzymeAttributorPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, EarlyCSEPass()) + end + end + LLVM.run!(pb, mod) + end post_attr!(mod, RunAttributor[]) propagate_returned!(mod) From ddf06a4b044823e89725e67a15cc4df7b58222fa Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 21:21:22 +0200 Subject: [PATCH 12/22] fixup pipeline_options --- src/compiler/optimize.jl | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index ae6735df10..b4ebce7601 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -50,8 +50,8 @@ function pipeline_options(; enable_vector_pipeline::Bool = true, remove_ni::Bool = true, cleanup::Bool = true, - Size::Cint = 0, - Speedup::Cint = 3, + Size::Cint = Cint(0), + Speedup::Cint = Cint(3), ) return PipelineConfig( Speedup, @@ -872,6 +872,35 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, 
machine::Bool run!(pb, mod, tm) end end + # Wanted to use this but julia_pipeline is not ready for prime time + # @dispose pb = NewPMPassBuilder() begin + # registerEnzymeAndPassPipeline!(pb) + # register!(pb, ReinsertGCMarkerPass()) + + # add!(pb, NewPMModulePassManager()) do mpm + # if machine + # add!(mpm, NewPMFunctionPassManager()) do fpm + # add!(fpm, ReinsertGCMarkerPass()) + # end + # end + + # julia_pipeline(pb, mpm; + # lower_intrinsics = machine, + # dump_native = false, + # external_use = false, + # llvm_only = false, + # always_inline = true, + # enable_early_simplifications = true, + # enable_early_optimizations = true, + # enable_scalar_optimizations = true, + # enable_loop_optimizations = true, + # enable_vector_pipeline = true, + # remove_ni = true, + # cleanup = true, + # ) + # end + # run!(pb, mod, tm) + # end for f in functions(mod) if isempty(blocks(f)) continue From 30938be7d06005b3638a2c71a3f11d85ef4229f8 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 24 Oct 2025 21:31:07 +0200 Subject: [PATCH 13/22] remote dead-code pipeline_options --- src/compiler/optimize.jl | 209 +++++---------------------------------- 1 file changed, 22 insertions(+), 187 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index b4ebce7601..766f5e55a1 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -5,6 +5,8 @@ end LLVM.@function_pass "jl-inst-simplify" JLInstSimplifyPass +const RunAttributor = Ref(true) + function enzyme_attributor_pass!(mod::LLVM.Module) ccall( (:RunAttributorOnModule, API.libEnzyme), @@ -17,87 +19,6 @@ end EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_pass!) 
- -struct PipelineConfig - Speedup::Cint - Size::Cint - lower_intrinsics::Cint - dump_native::Cint - external_use::Cint - llvm_only::Cint - always_inline::Cint - enable_early_simplifications::Cint - enable_early_optimizations::Cint - enable_scalar_optimizations::Cint - enable_loop_optimizations::Cint - enable_vector_pipeline::Cint - remove_ni::Cint - cleanup::Cint -end - -const RunAttributor = Ref(true) - -function pipeline_options(; - lower_intrinsics::Bool = true, - dump_native::Bool = false, - external_use::Bool = false, - llvm_only::Bool = false, - always_inline::Bool = true, - enable_early_simplifications::Bool = true, - enable_early_optimizations::Bool = true, - enable_scalar_optimizations::Bool = true, - enable_loop_optimizations::Bool = true, - enable_vector_pipeline::Bool = true, - remove_ni::Bool = true, - cleanup::Bool = true, - Size::Cint = Cint(0), - Speedup::Cint = Cint(3), -) - return PipelineConfig( - Speedup, - Size, - lower_intrinsics, - dump_native, - external_use, - llvm_only, - always_inline, - enable_early_simplifications, - enable_early_optimizations, - enable_scalar_optimizations, - enable_loop_optimizations, - enable_vector_pipeline, - remove_ni, - cleanup, - ) -end - -function run_jl_pipeline(pm::ModulePassManager, tm::LLVM.TargetMachine; kwargs...) - config = Ref(pipeline_options(; kwargs...)) - function jl_pipeline(m) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - @ccall jl_build_newpm_pipeline( - mpm.ref::Ptr{Cvoid}, - pb.ref::Ptr{Cvoid}, - config::Ptr{PipelineConfig}, - )::Cvoid - end - LLVM.run!(mpm, m, tm) - end - return true - end - add!(pm, ModulePass("JLPipeline", jl_pipeline)) -end - -function julia_pipeline(pb, mpm; kwargs...) 
- config = Ref(pipeline_options(; kwargs...)) - @ccall jl_build_newpm_pipeline( - mpm.ref::Ptr{Cvoid}, - pb.ref::Ptr{Cvoid}, - config::Ptr{PipelineConfig}, - )::Cvoid -end - @static if VERSION < v"1.11.0-DEV.428" else barrier_noop!(pm) = nothing @@ -233,86 +154,32 @@ else end end - function loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if true || VERSION < v"1.11-" - lower_simdloop_tm!(pm, tm) - licm!(pm) - if LLVM.version() >= v"15" - simple_loop_unswitch_legacy!(pm) - else - loop_unswitch!(pm) - end + lower_simdloop_tm!(pm, tm) + licm!(pm) + if LLVM.version() >= v"15" + simple_loop_unswitch_legacy!(pm) else - run_jl_pipeline( - pm, - tm; - lower_intrinsics = false, - dump_native = false, - external_use = false, - llvm_only = false, - always_inline = false, - enable_early_simplifications = false, - enable_early_optimizations = false, - enable_scalar_optimizations = false, - enable_loop_optimizations = true, - enable_vector_pipeline = false, - remove_ni = false, - cleanup = false, - ) + loop_unswitch!(pm) end end - function more_loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if true || VERSION < v"1.11-" - loop_rotate!(pm) - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - loop_idiom!(pm) - - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards - lower_simdloop_tm!(pm, tm) # Annotate loop marked with "loopinfo" as LLVM parallel loop - licm!(pm) - julia_licm_tm!(pm, tm) - # Subsequent passes not stripping metadata from terminator - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) + loop_rotate!(pm) + # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) + loop_idiom!(pm) + + # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards + lower_simdloop_tm!(pm, tm) # Annotate loop marked with "loopinfo" as LLVM parallel loop + licm!(pm) + 
julia_licm_tm!(pm, tm) + # Subsequent passes not stripping metadata from terminator + instruction_combining!(pm) # TODO: createInstSimplifyLegacy + jl_inst_simplify!(pm) - ind_var_simplify!(pm) - loop_deletion!(pm) - loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll - else - # LowerSIMDLoopPass - # LoopRotatePass [opt >= 2] - # LICMPass - # JuliaLICMPass - # SimpleLoopUnswitchPass - # LICMPass - # JuliaLICMPass - # IRCEPass - # LoopInstSimplifyPass - # - in ours this is instcombine with jlinstsimplify - # LoopIdiomRecognizePass - # IndVarSimplifyPass - # LoopDeletionPass - # LoopFullUnrollPass - run_jl_pipeline( - pm, - tm; - lower_intrinsics = false, - dump_native = false, - external_use = false, - llvm_only = false, - always_inline = false, - enable_early_simplifications = false, - enable_early_optimizations = false, - enable_scalar_optimizations = false, - enable_loop_optimizations = true, - enable_vector_pipeline = false, - remove_ni = false, - cleanup = false, - ) - end + ind_var_simplify!(pm) + loop_deletion!(pm) + loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll end @static if VERSION < v"1.11-" @@ -860,11 +727,8 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool register!(pb, ReinsertGCMarkerPass()) add!(pb, NewPMModulePassManager()) do mpm # TODO(NewPM) - # addTargetPasses!(mpm, tm, LLVM.triple(mod)) # addOptimizationPasses!(mpm, tm) - end - if machine - add!(pb, NewPMModulePassManager()) do mpm + if machine addJuliaLegalizationPasses_newPM!(mpm, true) addMachinePasses_newPM!(mpm) end @@ -872,35 +736,6 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool run!(pb, mod, tm) end end - # Wanted to use this but julia_pipeline is not ready for prime time - # @dispose pb = NewPMPassBuilder() begin - # registerEnzymeAndPassPipeline!(pb) - # register!(pb, ReinsertGCMarkerPass()) - - # add!(pb, NewPMModulePassManager()) do mpm - # if machine - # add!(mpm, NewPMFunctionPassManager()) do fpm 
- # add!(fpm, ReinsertGCMarkerPass()) - # end - # end - - # julia_pipeline(pb, mpm; - # lower_intrinsics = machine, - # dump_native = false, - # external_use = false, - # llvm_only = false, - # always_inline = true, - # enable_early_simplifications = true, - # enable_early_optimizations = true, - # enable_scalar_optimizations = true, - # enable_loop_optimizations = true, - # enable_vector_pipeline = true, - # remove_ni = true, - # cleanup = true, - # ) - # end - # run!(pb, mod, tm) - # end for f in functions(mod) if isempty(blocks(f)) continue From 94c51237877e134e17964673b7b66e350867d196 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 27 Oct 2025 16:59:20 +0100 Subject: [PATCH 14/22] translate more of the pipeline and remove old code --- src/compiler/optimize.jl | 490 ++++++++++----------------------------- 1 file changed, 119 insertions(+), 371 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 766f5e55a1..e966c5960f 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -18,32 +18,8 @@ function enzyme_attributor_pass!(mod::LLVM.Module) end EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_pass!) 
- -@static if VERSION < v"1.11.0-DEV.428" -else - barrier_noop!(pm) = nothing -end - -@static if VERSION < v"1.11-" - function gc_invariant_verifier_tm!(pm::ModulePassManager, tm::LLVM.TargetMachine, cond::Bool) - gc_invariant_verifier!(pm, cond) - end -else - function gc_invariant_verifier_tm!(pm::ModulePassManager, tm::LLVM.TargetMachine, cond::Bool) - function gc_invariant_verifier(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, GCInvariantVerifierPass(; strong = cond)) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("GCInvariantVerifier", gc_invariant_verifier)) - end -end +ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) +SafeAtomicToRegularStorePass() = NewPMFunctionPass("safe_atomic_to_regular_store", safe_atomic_to_regular_store!) @static if VERSION < v"1.11-" function propagate_julia_addrsp_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) @@ -87,49 +63,6 @@ else end end -@static if VERSION < v"1.11-" - function remove_ni_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - remove_ni!(pm) - end -else - function remove_ni_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function remove_ni(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, RemoveNIPass()) - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("RemoveNI", remove_ni)) - end -end - -@static if VERSION < v"1.11-" - function julia_licm_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - julia_licm!(pm) - end -else - function julia_licm_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function julia_licm(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, 
JuliaLICMPass()) - end - end - end - run!(pb, mod) - end - return true - end - # really looppass - add!(pm, ModulePass("JuliaLICM", julia_licm)) - end -end - @static if VERSION < v"1.11-" function lower_simdloop_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) lower_simdloop!(pm) @@ -164,148 +97,24 @@ function loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachi end end -function more_loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - loop_rotate!(pm) - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - loop_idiom!(pm) - - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards - lower_simdloop_tm!(pm, tm) # Annotate loop marked with "loopinfo" as LLVM parallel loop - licm!(pm) - julia_licm_tm!(pm, tm) - # Subsequent passes not stripping metadata from terminator - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - - ind_var_simplify!(pm) - loop_deletion!(pm) - loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll -end - -@static if VERSION < v"1.11-" - function demote_float16_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - demote_float16!(pm) - end -else - function demote_float16_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function demote_float16(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, DemoteFloat16Pass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("DemoteFloat16", demote_float16)) - end -end - -@static if VERSION < v"1.11-" - function lower_exc_handlers_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - lower_exc_handlers!(pm) - end -else - function lower_exc_handlers_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function lower_exc_handlers(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, 
NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, LowerExcHandlersPass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("LowerExcHandlers", lower_exc_handlers)) - end -end - -@static if VERSION < v"1.11-" - function lower_ptls_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, dump_native::Bool) - lower_ptls!(pm, dump_native) - end -else - function lower_ptls_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, dump_native::Bool) - function lower_ptls(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, LowerPTLSPass()) - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("LowerPTLS", lower_ptls)) - end -end - -@static if VERSION < v"1.11-" - function combine_mul_add_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - combine_mul_add!(pm) - end -else - function combine_mul_add_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) -@static if VERSION < v"1.12.0-DEV.1390" - function combine_mul_add(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, CombineMulAddPass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("CombineMulAdd", combine_mul_add)) -end - end -end - -@static if VERSION < v"1.11-" - function late_lower_gc_frame_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - late_lower_gc_frame!(pm) - end -else - function late_lower_gc_frame_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function late_lower_gc_frame(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, LateLowerGCPass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("LateLowerGCFrame", late_lower_gc_frame)) - end -end +function 
more_loop_optimizations_newPM!(fpm::LLVM.NewPMPassManager) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopRotatePass()) + # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) + # add!(lpm, LoopIdiomPass()) TODO(NewPM): This seems to have gotten removed -@static if VERSION < v"1.11-" - function final_lower_gc_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - final_lower_gc!(pm) + # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards + add!(lpm, LowerSIMDLoopPass()) # Annotate loop marked with "loopinfo" as LLVM parallel loop + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) end -else - function final_lower_gc_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function final_lower_gc(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, FinalLowerGCPass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("FinalLowerGCFrame", final_lower_gc)) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) end + add!(fpm, LoopUnrollPass(opt_level=2)) end @static if VERSION < v"1.11-" @@ -482,108 +291,90 @@ function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) nodecayed_phis!(mod) end -# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L603 -function addTargetPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, trip::String) - add_library_info!(pm, trip) - add_transform_info!(pm, tm) -end +function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, ReinsertGCMarkerPass()) + end -# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L620 -function 
addOptimizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - add!(pm, FunctionPass("ReinsertGCMarker", reinsert_gcmarker_pass!)) - - constant_merge!(pm) - - propagate_julia_addrsp_tm!(pm, tm) - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cfgsimplification!(pm) - dce!(pm) - scalar_repl_aggregates!(pm) - - # mem_cpy_opt!(pm) - - always_inliner!(pm) # Respect always_inline - - # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time - # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` - # pass. - - alloc_opt_tm!(pm, tm) - # consider AggressiveInstCombinePass at optlevel > 2 - - instruction_combining!(pm) - jl_inst_simplify!(pm) - cfgsimplification!(pm) - scalar_repl_aggregates!(pm) - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - - reassociate!(pm) - - early_cse!(pm) - - # Load forwarding above can expose allocations that aren't actually used - # remove those before optimizing loops. - alloc_opt_tm!(pm, tm) - - more_loop_optimizations_tm!(pm, tm) - - # Run our own SROA on heap objects before LLVM's - alloc_opt_tm!(pm, tm) - # Re-run SROA after loop-unrolling (useful for small loops that operate, - # over the structure of an aggregate) - scalar_repl_aggregates!(pm) - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - - gvn!(pm) - mem_cpy_opt!(pm) - sccp!(pm) - - # Run instcombine after redundancy elimination to exploit opportunities - # opened up by them. - # This needs to be InstCombine instead of InstSimplify to allow - # loops over Union-typed arrays to vectorize. 
- instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - dead_store_elimination!(pm) - add!(pm, FunctionPass("SafeAtomicToRegularStore", safe_atomic_to_regular_store!)) - - # More dead allocation (store) deletion before loop optimization - # consider removing this: - alloc_opt_tm!(pm, tm) - - # see if all of the constant folding has exposed more loops - # to simplification and deletion - # this helps significantly with cleaning up iteration - cfgsimplification!(pm) - loop_deletion!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - loop_vectorize!(pm) - # TODO: createLoopLoadEliminationPass - cfgsimplification!(pm) - slpvectorize!(pm) - # might need this after LLVM 11: - # TODO: createVectorCombinePass() - - aggressive_dce!(pm) -end + add!(mpm, ConstantMergePass()) + + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PropagateJuliaAddrspacesPass()) + + add!(fpm, SimplifyCFGPass()) + add!(fpm, DCEPass()) + add!(fpm, SROAPass()) + end -function addMachinePasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - combine_mul_add_tm!(pm, tm) - # TODO: createDivRemPairs[] + add!(mpm, AlwaysInlinerPass()) - demote_float16_tm!(pm, tm) - gvn!(pm) + add!(mpm, NewPMFunctionPassManager()) do fpm + # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time + # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` + # pass. + + + add!(fpm, AllocOptPass()) + # consider AggressiveInstCombinePass at optlevel > 2 + + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, InstSimplifyPass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + + add!(fpm, ReassociatePass()) + add!(fpm, EarlyCSEPass()) + + # Load forwarding above can expose allocations that aren't actually used + # remove those before optimizing loops. 
+ add!(fpm, AllocOptPass()) + + more_loop_optimizations_newPM!(fpm) + + # Run our own SROA on heap objects before LLVM's + add!(fpm, AllocOptPass()) + # Re-run SROA after loop-unrolling (useful for small loops that operate, + # over the structure of an aggregate) + add!(fpm, SROAPass()) + add!(fpm, InstSimplifyPass()) + + add!(fpm, GVNPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + + # Run instcombine after redundancy elimination to exploit opportunities + # opened up by them. + # This needs to be InstCombine instead of InstSimplify to allow + # loops over Union-typed arrays to vectorize. + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, DSEPass()) + add!(fpm, SafeAtomicToRegularStorePass()) + + # More dead allocation (store) deletion before loop optimization + # consider removing this: + add!(fpm, AllocOptPass()) + + # see if all of the constant folding has exposed more loops + # to simplification and deletion + # this helps significantly with cleaning up iteration + add!(fpm, SimplifyCFGPass()) + add!(fpm, LoopDeletionPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, LoopVectorizePass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SLPVectorizerPass()) + add!(fpm, ADCEPass()) + end end -function addMachinePasses_newPM!(mpm::LLVM.NewPMPassManager) +function addMachinePasses!(mpm::LLVM.NewPMPassManager) add!(mpm, NewPMFunctionPassManager()) do fpm if VERSION < v"1.12.0-DEV.1390" add!(fpm, CombineMulAddPass()) @@ -594,44 +385,7 @@ function addMachinePasses_newPM!(mpm::LLVM.NewPMPassManager) end end -function addJuliaLegalizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, lower_intrinsics::Bool = true) - if lower_intrinsics - # LowerPTLS removes an indirect call. As a result, it is likely to trigger - # LLVM's devirtualization heuristics, which would result in the entire - # pass pipeline being re-exectuted. Prevent this by inserting a barrier. 
- barrier_noop!(pm) - add!(pm, FunctionPass("ReinsertGCMarker", reinsert_gcmarker_pass!)) - lower_exc_handlers_tm!(pm, tm) - # BUDE.jl demonstrates a bug here TODO - gc_invariant_verifier_tm!(pm, tm, false) - verifier!(pm) - - # Needed **before** LateLowerGCFrame on LLVM < 12 - # due to bug in `CreateAlignmentAssumption`. - remove_ni_tm!(pm, tm) - late_lower_gc_frame_tm!(pm, tm) - final_lower_gc_tm!(pm, tm) - # We need these two passes and the instcombine below - # after GC lowering to let LLVM do some constant propagation on the tags. - # and remove some unnecessary write barrier checks. - gvn!(pm) - sccp!(pm) - # Remove dead use of ptls - dce!(pm) - lower_ptls_tm!(pm, tm, false) #=dump_native=# - instruction_combining!(pm) - jl_inst_simplify!(pm) - # Clean up write barrier and ptls lowering - cfgsimplification!(pm) - else - barrier_noop!(pm) - remove_ni_tm!(pm, tm) - end -end - -ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) - -function addJuliaLegalizationPasses_newPM!(mpm::LLVM.NewPMPassManager, lower_intrinsics::Bool = true) +function addJuliaLegalizationPasses!(mpm::LLVM.NewPMPassManager, lower_intrinsics::Bool = true) if lower_intrinsics add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, ReinsertGCMarkerPass()) @@ -651,13 +405,18 @@ function addJuliaLegalizationPasses_newPM!(mpm::LLVM.NewPMPassManager, lower_int end if VERSION < v"1.11.0-DEV.208" add!(mpm, FinalLowerGCPass()) - end + end + # We need these two passes and the instcombine below + # after GC lowering to let LLVM do some constant propagation on the tags. + # and remove some unnecessary write barrier checks. 
add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, GVNPass()) add!(fpm, SCCPPass()) + # Remove dead use of ptls add!(fpm, DCEPass()) end add!(mpm, LowerPTLSPass()) + # Clean up write barrier and ptls lowering add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, InstCombinePass()) add!(fpm, JLInstSimplifyPass()) @@ -705,36 +464,25 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool ), ) end - # TODO(NewPM): Swap conditionals when the pipeline is ready - if LLVM.has_oldpm() - LLVM.ModulePassManager() do pm - addTargetPasses!(pm, tm, LLVM.triple(mod)) - addOptimizationPasses!(pm, tm) - LLVM.run!(pm, mod) - end - if machine - # TODO enable validate_return_roots - # validate_return_roots!(mod) - LLVM.ModulePassManager() do pm - addJuliaLegalizationPasses!(pm, tm, true) - addMachinePasses!(pm, tm) - LLVM.run!(pm, mod) - end + @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, ReinsertGCMarkerPass()) + register!(pb, SafeAtomicToRegularStorePass()) + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) end - else - @dispose pb = NewPMPassBuilder() begin - registerEnzymeAndPassPipeline!(pb) - register!(pb, ReinsertGCMarkerPass()) - add!(pb, NewPMModulePassManager()) do mpm - # TODO(NewPM) - # addOptimizationPasses!(mpm, tm) - if machine - addJuliaLegalizationPasses_newPM!(mpm, true) - addMachinePasses_newPM!(mpm) - end + add!(pb, NewPMModulePassManager()) do mpm + addOptimizationPasses!(mpm) + if machine + # TODO enable validate_return_roots + # validate_return_roots!(mod) + addJuliaLegalizationPasses!(mpm, true) + addMachinePasses!(mpm) end - run!(pb, mod, tm) end + run!(pb, mod, tm) end for f in functions(mod) if isempty(blocks(f)) From 95ddfc4d57b9d4409caafd40a551da79ee4efcc7 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 27 Oct 2025 18:55:55 +0100 Subject: [PATCH 15/22] finish pipeline translation to NewPM --- 
src/compiler/optimize.jl | 378 ++++++++++++--------------------------- 1 file changed, 119 insertions(+), 259 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index e966c5960f..68cd62bb3a 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -20,275 +20,118 @@ end EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_pass!) ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) SafeAtomicToRegularStorePass() = NewPMFunctionPass("safe_atomic_to_regular_store", safe_atomic_to_regular_store!) +Addr13NoAliasPass() = NewPMModulePass("addr13_noalias", addr13NoAlias) +RewriteGenericMemoryPass() = NewPMModulePass("rewrite_generic_memory", rewrite_generic_memory) -@static if VERSION < v"1.11-" - function propagate_julia_addrsp_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - propagate_julia_addrsp!(pm) - end -else - function propagate_julia_addrsp_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function prop_julia_addr(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, PropagateJuliaAddrspacesPass()) - end - end - run!(pb, mod) - end - return true +function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) + @dispose pb = NewPMPassBuilder() begin + register!(pb, Addr13NoAliasPass()) + register!(pb, RewriteGenericMemoryPass()) + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) end - add!(pm, ModulePass("PropagateJuliaAddrSpace", prop_julia_addr)) - end -end + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, Addr13NoAliasPass()) + add!(mpm, PropagateJuliaAddrspacesPass()) + + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, SimplifyCFGPass()) + add!(fpm, DCEPass()) + add!(fpm, CPUFeaturesPass()) + add!(fpm, SROAPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, 
AlwaysInlinerPass()) + add!(fpm, AllocOptPass()) + end + + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) + end -@static if VERSION < v"1.11-" - function alloc_opt_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - alloc_opt!(pm) - end -else - function alloc_opt_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function alloc_opt(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, AllocOptPass()) - end + add!(mpm, RewriteGenericMemoryPass()) + + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, ReassociatePass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, AllocOptPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + # TODO(NewPM) + # loop idiom + add!(lpm, LoopRotatePass()) + add!(lpm, LowerSIMDLoopPass()) + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + add!(lpm, SimpleLoopUnswitchPass()) end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("AllocOpt", alloc_opt)) - end -end -@static if VERSION < v"1.11-" - function lower_simdloop_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - lower_simdloop!(pm) - end -else - function lower_simdloop_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function lower_simdloop(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, LowerSIMDLoopPass()) - end - end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, 
NewPMLoopPassManager()) do lpm + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) end - run!(pb, mod) + add!(fpm, LoopUnrollPass(opt_level=2)) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, GVNPass()) + + # This InstCombine needs to be after GVN + # Otherwise it will generate load chains in GPU code... + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, DSEPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SimplifyCFGPass()) + + + # TODO(NewPM) + # loop idiom + # loop deletion + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + + add!(fpm, ADCEPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + + # GC passes + add!(fpm, GCInvariantVerifierPass(strong=false)) + add!(fpm, SimplifyCFGPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) end - return true - end - # really looppass - add!(pm, ModulePass("LowerSIMDLoop", lower_simdloop)) - end -end - -function loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - lower_simdloop_tm!(pm, tm) - licm!(pm) - if LLVM.version() >= v"15" - simple_loop_unswitch_legacy!(pm) - else - loop_unswitch!(pm) - end -end - -function more_loop_optimizations_newPM!(fpm::LLVM.NewPMPassManager) - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, LoopRotatePass()) - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - # add!(lpm, LoopIdiomPass()) TODO(NewPM): This seems to have gotten removed - - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards - add!(lpm, LowerSIMDLoopPass()) # Annotate loop marked with "loopinfo" as LLVM parallel loop - add!(lpm, LICMPass()) - add!(lpm, JuliaLICMPass()) - end - add!(fpm, InstCombinePass()) - add!(fpm, JLInstSimplifyPass()) - add!(fpm, 
NewPMLoopPassManager()) do lpm - add!(lpm, IndVarSimplifyPass()) - add!(lpm, LoopDeletionPass()) - end - add!(fpm, LoopUnrollPass(opt_level=2)) -end -@static if VERSION < v"1.11-" - function cpu_features_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if isdefined(LLVM.Interop, :cpu_features!) - LLVM.Interop.cpu_features!(pm) - else - @static if isdefined(GPUCompiler, :cpu_features!) - GPUCompiler.cpu_features!(pm) + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) end end - end -else - function cpu_features_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function cpu_features(mod) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, CPUFeaturesPass()) - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("CPUFeatures", cpu_features)) - end -end - -function jl_inst_simplify!(PM::LLVM.ModulePassManager) - ccall( - (:LLVMAddJLInstSimplifyPass, API.libEnzyme), - Cvoid, - (LLVM.API.LLVMPassManagerRef,), - PM, - ) -end - -cse!(pm) = LLVM.API.LLVMAddEarlyCSEPass(pm) -function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) - addr13NoAlias(mod) - if !LLVM.has_oldpm() - # TODO(NewPM) - return - end - # everying except unroll, slpvec, loop-vec - # then finish Julia GC - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - propagate_julia_addrsp_tm!(pm, tm) - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cfgsimplification!(pm) - dce!(pm) - cpu_features_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? 
- mem_cpy_opt!(pm) - always_inliner!(pm) - alloc_opt_tm!(pm, tm) - LLVM.run!(pm, mod) - end - - # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to - # known functions - ModulePassManager() do pm - - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) - - LLVM.API.LLVMAddGlobalOptimizerPass(pm) # Extra - gvn!(pm) # Extra - LLVM.run!(pm, mod) - end + run!(pb, mod, tm) - rewrite_generic_memory!(mod) - - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) - - instruction_combining!(pm) - jl_inst_simplify!(pm) - cfgsimplification!(pm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - reassociate!(pm) - early_cse!(pm) - alloc_opt_tm!(pm, tm) - loop_idiom!(pm) - loop_rotate!(pm) - - loop_optimizations_tm!(pm, tm) - - instruction_combining!(pm) - jl_inst_simplify!(pm) - ind_var_simplify!(pm) - loop_deletion!(pm) - loop_unroll!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - gvn!(pm) - - # This InstCombine needs to be after GVN - # Otherwise it will generate load chains in GPU code... 
- instruction_combining!(pm) - jl_inst_simplify!(pm) - mem_cpy_opt!(pm) - sccp!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - dead_store_elimination!(pm) - alloc_opt_tm!(pm, tm) - cfgsimplification!(pm) - loop_idiom!(pm) - loop_deletion!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - # SLP_Vectorizer -- not for Enzyme - - LLVM.run!(pm, mod) - - aggressive_dce!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - # Loop Vectorize -- not for Enzyme - # InstCombine - - # GC passes - barrier_noop!(pm) - gc_invariant_verifier_tm!(pm, tm, false) - - # FIXME: Currently crashes printing - cfgsimplification!(pm) - instruction_combining!(pm) # Extra for Enzyme - jl_inst_simplify!(pm) - LLVM.run!(pm, mod) + # TODO: Turn into passes? + removeDeadArgs!(mod, tm) + detect_writeonly!(mod) + nodecayed_phis!(mod) end - - # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to - # known functions - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) - - LLVM.API.LLVMAddGlobalOptimizerPass(pm) # Exxtra - gvn!(pm) # Exxtra - LLVM.run!(pm, mod) - end - removeDeadArgs!(mod, tm) - detect_writeonly!(mod) - nodecayed_phis!(mod) end function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) @@ -313,7 +156,6 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` # pass. - add!(fpm, AllocOptPass()) # consider AggressiveInstCombinePass at optlevel > 2 @@ -333,7 +175,23 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) # remove those before optimizing loops. 
add!(fpm, AllocOptPass()) - more_loop_optimizations_newPM!(fpm) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopRotatePass()) + # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) + # add!(lpm, LoopIdiomPass()) TODO(NewPM): This seems to have gotten removed + + # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards + add!(lpm, LowerSIMDLoopPass()) # Annotate loop marked with "loopinfo" as LLVM parallel loop + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) + end + add!(fpm, LoopUnrollPass(opt_level=2)) # Run our own SROA on heap objects before LLVM's add!(fpm, AllocOptPass()) @@ -364,7 +222,9 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) # to simplification and deletion # this helps significantly with cleaning up iteration add!(fpm, SimplifyCFGPass()) - add!(fpm, LoopDeletionPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopDeletionPass()) + end add!(fpm, InstCombinePass()) add!(fpm, JLInstSimplifyPass()) add!(fpm, LoopVectorizePass()) From a37fe6c4a057a5c464fdcf2271462a8084f5a26f Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 27 Oct 2025 18:58:21 +0100 Subject: [PATCH 16/22] loop idiom --- src/compiler/optimize.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 68cd62bb3a..9d6a439a92 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -68,8 +68,7 @@ function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) add!(fpm, EarlyCSEPass()) add!(fpm, AllocOptPass()) add!(fpm, NewPMLoopPassManager()) do lpm - # TODO(NewPM) - # loop idiom + add!(lpm, LoopIdiomRecognizePass()) add!(lpm, LoopRotatePass()) add!(lpm, LowerSIMDLoopPass()) add!(lpm, LICMPass()) @@ -102,9 +101,10 @@ 
function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) add!(fpm, SimplifyCFGPass()) - # TODO(NewPM) - # loop idiom - # loop deletion + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopIdiomRecognizePass()) + add!(lpm, LoopDeletionPass()) + end add!(fpm, JumpThreadingPass()) add!(fpm, CorrelatedValuePropagationPass()) @@ -178,7 +178,7 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) add!(fpm, NewPMLoopPassManager()) do lpm add!(lpm, LoopRotatePass()) # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - # add!(lpm, LoopIdiomPass()) TODO(NewPM): This seems to have gotten removed + add!(lpm, LoopIdiomRecognizePass()) # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards add!(lpm, LowerSIMDLoopPass()) # Annotate loop marked with "loopinfo" as LLVM parallel loop From 61181b954a30e818f6964010215b9a3b1d7e28ed Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 27 Oct 2025 19:01:35 +0100 Subject: [PATCH 17/22] turn rewrite_generic_memory! into a pass --- src/compiler/optimize.jl | 2 +- src/llvm/transforms.jl | 44 +++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 9d6a439a92..6c0baa3408 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -21,7 +21,7 @@ EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_ ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) SafeAtomicToRegularStorePass() = NewPMFunctionPass("safe_atomic_to_regular_store", safe_atomic_to_regular_store!) Addr13NoAliasPass() = NewPMModulePass("addr13_noalias", addr13NoAlias) -RewriteGenericMemoryPass() = NewPMModulePass("rewrite_generic_memory", rewrite_generic_memory) +RewriteGenericMemoryPass() = NewPMModulePass("rewrite_generic_memory", rewrite_generic_memory!) 
function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) @dispose pb = NewPMPassBuilder() begin diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index 4ab3344b95..f36a83e498 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -2369,31 +2369,33 @@ function checkNoAssumeFalse(mod::LLVM.Module, shouldshow::Bool = false) end function rewrite_generic_memory!(mod::LLVM.Module) -@static if VERSION < v"1.11-" -else - for f in functions(mod), bb in blocks(f) - iter = LLVM.API.LLVMGetFirstInstruction(bb) - while iter != C_NULL - inst = LLVM.Instruction(iter) - iter = LLVM.API.LLVMGetNextInstruction(iter) - if !isa(inst, LLVM.LoadInst) - continue - end - - if isa(operands(inst)[1], LLVM.ConstantExpr) + @static if VERSION < v"1.11-" + return false + else + for f in functions(mod), bb in blocks(f) + iter = LLVM.API.LLVMGetFirstInstruction(bb) + while iter != C_NULL + inst = LLVM.Instruction(iter) + iter = LLVM.API.LLVMGetNextInstruction(iter) + if !isa(inst, LLVM.LoadInst) + continue + end + + if isa(operands(inst)[1], LLVM.ConstantExpr) legal2, obj = absint(inst) if legal2 && obj isa Memory && obj == typeof(obj).instance - b = LLVM.IRBuilder() - position!(b, inst) - replace_uses!(inst, unsafe_to_llvm(b, obj)) - LLVM.API.LLVMInstructionEraseFromParent(inst) - continue - end - end - end + b = LLVM.IRBuilder() + position!(b, inst) + replace_uses!(inst, unsafe_to_llvm(b, obj)) + LLVM.API.LLVMInstructionEraseFromParent(inst) + continue + end + end + end + end + return true end end -end function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) # We need to run globalopt first. This is because remove dead args will otherwise From 9197d70c67409a9ff33272b147ed098317081b4a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 27 Oct 2025 19:02:48 +0100 Subject: [PATCH 18/22] fixup! 
finish pipeline translation to NewPM --- src/compiler/optimize.jl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 6c0baa3408..798c9d0439 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -25,6 +25,7 @@ RewriteGenericMemoryPass() = NewPMModulePass("rewrite_generic_memory", rewrite_g function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) register!(pb, Addr13NoAliasPass()) register!(pb, RewriteGenericMemoryPass()) add!(pb, NewPMAAManager()) do aam @@ -34,15 +35,19 @@ function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) end add!(pb, NewPMModulePassManager()) do mpm add!(mpm, Addr13NoAliasPass()) - add!(mpm, PropagateJuliaAddrspacesPass()) add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PropagateJuliaAddrspacesPass()) add!(fpm, SimplifyCFGPass()) add!(fpm, DCEPass()) - add!(fpm, CPUFeaturesPass()) + end + add!(mpm, CPUFeaturesPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, SROAPass()) add!(fpm, MemCpyOptPass()) - add!(fpm, AlwaysInlinerPass()) + end + add!(mpm, AlwaysInlinerPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm add!(fpm, AllocOptPass()) end From ad5b238a8a0c172ab87c96e2686ab0e749261801 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 28 Oct 2025 04:14:27 +0100 Subject: [PATCH 19/22] fixup pass --- src/llvm/transforms.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index f36a83e498..5462fd43ca 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -299,6 +299,7 @@ function addr13NoAlias(mod::LLVM.Module) end end end + return true end ## given code like From 31f8b761d11f741982d845bb86b1bb6d4de444fa Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 28 Oct 2025 13:38:58 +0100 Subject: [PATCH 20/22] LICM requires memory_ssa --- src/compiler/optimize.jl 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 798c9d0439..1aef52ce6b 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -72,7 +72,7 @@ function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) add!(fpm, ReassociatePass()) add!(fpm, EarlyCSEPass()) add!(fpm, AllocOptPass()) - add!(fpm, NewPMLoopPassManager()) do lpm + add!(fpm, NewPMLoopPassManager(use_memory_ssa=true)) do lpm add!(lpm, LoopIdiomRecognizePass()) add!(lpm, LoopRotatePass()) add!(lpm, LowerSIMDLoopPass()) @@ -180,7 +180,7 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) # remove those before optimizing loops. add!(fpm, AllocOptPass()) - add!(fpm, NewPMLoopPassManager()) do lpm + add!(fpm, NewPMLoopPassManageruse(use_memory_ssa=true)) do lpm add!(lpm, LoopRotatePass()) # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) add!(lpm, LoopIdiomRecognizePass()) From 408d185bfbedf6d9d08db3c6c3f390e5c417ce20 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 28 Oct 2025 13:47:37 +0100 Subject: [PATCH 21/22] PreserveNVVM pass --- src/compiler.jl | 28 +++++++++++++++++++--------- src/compiler/optimize.jl | 2 ++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/compiler.jl b/src/compiler.jl index e6ea66c331..96b1c01c5d 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -1263,9 +1263,12 @@ function nested_codegen!( edges = edges::Vector{Any} push!(edges, funcspec) - LLVM.ModulePassManager() do pm - API.AddPreserveNVVMPass!(pm, true) #=Begin=# - LLVM.run!(pm, otherMod) + LLVM.@dispose pb=LLVM.NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + LLVM.add!(pb, LLVM.NewPMModulePassManager()) do mpm + LLVM.add!(mpm, PreserveNVVMPass()) + end + LLVM.run!(pb, otherMod) end if DumpPreNestedCheck[] @@ -4499,9 +4502,12 @@ function GPUCompiler.compile_unhooked(output::Symbol, job::CompilerJob{<:EnzymeT permit_inlining!(f) end - 
LLVM.ModulePassManager() do pm - API.AddPreserveNVVMPass!(pm, true) #=Begin=# - LLVM.run!(pm, mod) + LLVM.@dispose pb=LLVM.NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + LLVM.add!(pb, LLVM.NewPMModulePassManager()) do mpm + LLVM.add!(mpm, PreserveNVVMPass()) + end + LLVM.run!(pb, mod) end primalf = meta.entry @@ -5180,10 +5186,14 @@ end augmented_primalf = nothing end - LLVM.ModulePassManager() do pm - API.AddPreserveNVVMPass!(pm, false) #=Begin=# - LLVM.run!(pm, mod) + LLVM.@dispose pb=LLVM.NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + LLVM.add!(pb, LLVM.NewPMModulePassManager()) do mpm + LLVM.add!(mpm, PreserveNVVMEndPass()) + end + LLVM.run!(pb, mod) end + if !(primal_target isa GPUCompiler.NativeCompilerTarget) mark_gpu_intrinsics!(primal_target, mod) end diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 1aef52ce6b..af30481422 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -4,6 +4,8 @@ function registerEnzymeAndPassPipeline!(pb::NewPMPassBuilder) end LLVM.@function_pass "jl-inst-simplify" JLInstSimplifyPass +LLVM.@module_pass "preserve-nvvm" PreserveNVVMPass +LLVM.@module_pass "preserve-nvvm-end" PreserveNVVMEndPass const RunAttributor = Ref(true) From 7c06ed4f65be605507af14dedf172acc08bc4cc0 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 28 Oct 2025 13:49:55 +0100 Subject: [PATCH 22/22] fixup! LICM requires memory_ssa --- src/compiler/optimize.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index af30481422..a4f4334f2d 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -182,7 +182,7 @@ function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) # remove those before optimizing loops. 
add!(fpm, AllocOptPass()) - add!(fpm, NewPMLoopPassManageruse(use_memory_ssa=true)) do lpm + add!(fpm, NewPMLoopPassManager(use_memory_ssa=true)) do lpm add!(lpm, LoopRotatePass()) # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) add!(lpm, LoopIdiomRecognizePass())