From 9cb018f0b35a51d1a8cea0337d6ed701e0f65e18 Mon Sep 17 00:00:00 2001 From: Ana Mihajlovic Date: Wed, 12 Mar 2025 17:33:07 +0100 Subject: [PATCH] Reland "[AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (#127212)" We have a VALU->SGPR->SALU (VALU writing to SGPR and SALU reading from it). When VALU is issued, it increments internal counter VA_SDST used to track use of this SGPR. SALU will not issue until VA_SDST is zero, that is when VALU is finished writing. Therefore, delays added by s_delay_alu are not needed in this situation. --- .../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 38 ++ .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 2 - .../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 96 +-- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 8 +- .../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 1 - llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 18 +- .../AMDGPU/atomic_optimizations_buffer.ll | 25 - .../atomic_optimizations_global_pointer.ll | 84 +-- .../atomic_optimizations_local_pointer.ll | 108 +-- .../atomic_optimizations_pixelshader.ll | 3 +- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 19 - .../atomic_optimizations_struct_buffer.ll | 19 - llvm/test/CodeGen/AMDGPU/bf16.ll | 6 +- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 1 - .../buffer-fat-pointer-atomicrmw-fadd.ll | 41 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 46 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 46 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 2 - .../test/CodeGen/AMDGPU/carryout-selection.ll | 13 +- .../CodeGen/AMDGPU/combine-add-zext-xor.ll | 16 +- .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 38 +- .../expand-scalar-carry-out-select-user.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 10 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 36 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 40 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 40 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 40 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 8 - llvm/test/CodeGen/AMDGPU/fma.f16.ll | 17 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 8 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 10 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 7 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 36 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 40 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 40 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 40 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 4 +- .../AMDGPU/global_atomics_scan_fadd.ll | 20 +- .../AMDGPU/global_atomics_scan_fmax.ll | 15 +- .../AMDGPU/global_atomics_scan_fmin.ll | 15 +- .../AMDGPU/global_atomics_scan_fsub.ll | 20 +- llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll | 16 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 16 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 12 +- llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 623 ++++++++++++------ .../insert_waitcnt_for_precise_memory.ll | 11 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 30 +- ...e92561-restore-undef-scc-verifier-error.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll | 54 +- .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 18 +- .../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 6 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 64 +- .../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 16 - ...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 2 - .../AMDGPU/llvm.amdgcn.readfirstlane.m0.ll | 1 - .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 1 - .../AMDGPU/llvm.amdgcn.s.ttracedata.ll | 1 - ....amdgcn.struct.buffer.load.format.v3f16.ll | 5 +- ...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 4 - ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 - ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 4 - ...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 9 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 9 +- ...gcn.struct.ptr.buffer.load.format.v3f16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 15 +- .../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 11 - llvm/test/CodeGen/AMDGPU/llvm.log.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 13 +- .../AMDGPU/load-constant-always-uniform.ll | 3 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 12 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 10 +- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 2 - .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 2 - .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 2 - llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 8 +- llvm/test/CodeGen/AMDGPU/min.ll | 4 - ...uf-legalize-operands-non-ptr-intrinsics.ll | 20 +- .../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 20 +- .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll | 3 +- .../AMDGPU/promote-constOffset-to-imm.ll | 12 +- .../AMDGPU/pseudo-scalar-transcendental.ll | 4 +- llvm/test/CodeGen/AMDGPU/saddo.ll | 9 +- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 - llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 5 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 6 +- ...r-descriptor-waterfall-loop-idom-update.ll | 2 +- 94 files changed, 920 insertions(+), 1299 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index b25619b4c5422..51c4528e07d62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -47,6 +47,13 @@ class AMDGPUInsertDelayAlu { return false; } + static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) { + // These instruction types wait for VA_SDST==0 before issuing. + const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD; + + return MI.getDesc().TSFlags & VA_SDST_0; + } + // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; @@ -227,6 +234,16 @@ class AMDGPUInsertDelayAlu { } } + void advanceByVALUNum(unsigned VALUNum) { + iterator Next; + for (auto I = begin(), E = end(); I != E; I = Next) { + Next = std::next(I); + if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) { + erase(I); + } + } + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(const TargetRegisterInfo *TRI) const { if (empty()) { @@ -331,6 +348,7 @@ class AMDGPUInsertDelayAlu { bool Changed = false; MachineInstr *LastDelayAlu = nullptr; + MCRegUnit LastSGPRFromVALU = 0; // Iterate over the contents of bundles, but don't emit any instructions // inside a bundle. for (auto &MI : MBB.instrs()) { @@ -345,6 +363,15 @@ class AMDGPUInsertDelayAlu { DelayType Type = getDelayType(MI.getDesc().TSFlags); + if (instructionWaitsForSGPRWrites(MI)) { + auto It = State.find(LastSGPRFromVALU); + if (It != State.end()) { + DelayInfo Info = It->getSecond(); + State.advanceByVALUNum(Info.VALUNum); + LastSGPRFromVALU = 0; + } + } + if (instructionWaitsForVALU(MI)) { // Forget about all outstanding VALU delays. // TODO: This is overkill since it also forgets about SALU delays. @@ -368,6 +395,17 @@ class AMDGPUInsertDelayAlu { } } } + + if (SII->isVALU(MI.getOpcode())) { + for (const auto &Op : MI.defs()) { + Register Reg = Op.getReg(); + if (AMDGPU::isSGPR(Reg, TRI)) { + LastSGPRFromVALU = *TRI->regunits(Reg).begin(); + break; + } + } + } + if (Emit && !MI.isBundledWithPred()) { // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or // just ignore them? diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index af21a07a4c3a1..e2d179a77f76c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: flat_store_b32 v[0:1], v3 ; GFX12-NEXT: s_endpgm @@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 6e55d7fdb5e95..be894f2c76f67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -361,21 +361,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -385,21 +385,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 { ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b @@ -2766,21 +2766,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2790,21 +2790,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 { ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf: @@ -3981,21 +3981,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,21 +4005,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 { ; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic: @@ -4359,21 +4359,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 ; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4383,21 +4383,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 { ; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic: @@ -4732,21 +4732,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4756,21 +4756,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) # ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x: @@ -5121,21 +5121,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_denorm_mode 15 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff ; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5145,21 +5145,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) # ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff ; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 ; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_y: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index d66f50bf04770..809a3e0dd8ef5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -234,8 +234,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7] ; GFX11-NEXT: ; implicit-def: $vgpr18 @@ -360,8 +360,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 ; GFX11-NEXT: ; implicit-def: $vgpr18 @@ -476,8 +476,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -604,8 +604,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16 ; GFX11-NEXT: ; implicit-def: $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll index af50f56a87226..7d084582273d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll @@ -1468,7 +1468,6 @@ define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) { ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] ; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index ba2af13338be6..bce06124f6db0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1072,12 +1072,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2436,39 +2435,33 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 ; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 ; GFX12-NEXT: v_mov_b32_e32 v20, v22 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 ; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 @@ -2490,7 +2483,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 ; GFX12-NEXT: v_mov_b32_e32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 @@ -2504,7 +2496,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 @@ -2521,10 +2512,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_wait_alu 0xf1fd ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index cd405fabf002d..8319e112f526e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -240,7 +240,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -454,7 +453,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +482,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -517,7 +514,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -548,7 +544,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -777,7 +772,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -822,7 +816,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -864,7 +857,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -890,7 +882,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -910,7 +901,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -936,7 +926,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1178,7 +1167,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1226,7 +1214,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1270,7 +1257,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1299,7 +1285,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1319,7 +1304,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -1347,7 +1331,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1985,7 +1968,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -2018,7 +2000,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -2246,7 +2227,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2291,7 +2271,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2334,7 +2313,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2360,7 +2338,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2380,7 +2357,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -2407,7 +2383,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 3737cc414c58f..1a0c15e2b28ec 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -215,7 +215,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -249,7 +248,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -285,7 +283,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -318,7 +315,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -899,7 +895,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -930,7 +925,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -950,7 +944,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -979,7 +972,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -999,7 +991,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -1029,7 +1020,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1049,7 +1039,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 @@ -1078,7 +1067,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1407,12 +1395,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1536,12 +1523,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -2159,12 +2145,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -2203,12 +2188,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -2246,7 +2230,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 @@ -2286,7 +2270,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 @@ -2576,17 +2560,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -2639,7 +2622,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 @@ -3260,7 +3242,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc @@ -3345,7 +3327,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 ; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo @@ -3402,7 +3384,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -3458,7 +3439,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -3546,7 +3526,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffd @@ -4086,7 +4066,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4123,7 +4102,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4162,7 +4140,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4200,7 +4177,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4454,7 +4430,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 @@ -4485,7 +4460,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4505,7 +4479,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 @@ -4534,7 +4507,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4554,7 +4526,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 @@ -4584,7 +4555,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4604,7 +4574,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 @@ -4633,7 +4602,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4962,12 +4930,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5091,12 +5058,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5744,7 +5710,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -5789,7 +5754,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -5833,7 +5797,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] ; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -5876,7 +5839,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] ; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -6164,17 +6126,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -6227,7 +6188,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 @@ -6848,7 +6808,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 ; GFX1164_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc @@ -6933,7 +6893,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 ; GFX1132_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo @@ -6990,7 +6950,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -7046,7 +7005,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -7134,7 +7092,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10 ; GFX1232_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffd @@ -7978,7 +7936,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_wait_alu 0xf1ff -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8044,7 +8001,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_wait_alu 0xf1ff -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -9254,7 +9210,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_wait_alu 0xf1ff -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9320,7 +9275,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_wait_alu 0xf1ff -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11308,7 +11262,6 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1264-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1264-NEXT: s_wait_alu 0xfffd -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1264-NEXT: s_wait_alu 0xf1ff ; GFX1264-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] @@ -11365,7 +11318,6 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1232-NEXT: v_cmp_u_f32_e64 s0, v0, v0 ; GFX1232-NEXT: s_wait_alu 0xfffd -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1232-NEXT: s_wait_alu 0xf1ff ; GFX1232-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index cd4e5a5730459..73cfdac8281a2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -186,7 +186,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +214,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +405,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 @@ -440,7 +437,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1132-NEXT: s_endpgm @@ -669,7 +665,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -694,7 +689,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -715,7 +709,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -737,7 +730,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -1215,7 +1207,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -1248,7 +1240,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 @@ -1911,12 +1903,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -1949,12 +1939,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -2217,17 +2205,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -2275,7 +2262,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -2751,7 +2737,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc @@ -2829,7 +2815,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo @@ -3019,11 +3005,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] @@ -3059,7 +3044,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -3829,7 +3813,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -3862,7 +3845,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX1132-NEXT: s_endpgm @@ -4091,7 +4073,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -4116,7 +4097,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -4137,7 +4117,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -4159,7 +4138,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,7 +4615,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -4670,7 +4648,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 @@ -5360,7 +5338,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -5399,7 +5376,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -5662,17 +5638,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -5720,7 +5695,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -6196,7 +6170,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc @@ -6274,7 +6248,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10 ; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo @@ -6508,7 +6482,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -6533,7 +6506,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6554,7 +6526,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -6576,7 +6547,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7873,7 +7843,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -7898,7 +7867,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7919,7 +7887,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -7941,7 +7908,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9237,7 +9203,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9262,7 +9227,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9283,7 +9247,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9305,7 +9268,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -10601,7 +10563,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -10626,7 +10587,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -10647,7 +10607,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -10669,7 +10628,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11516,13 +11474,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -11574,13 +11531,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -12435,7 +12391,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -12460,7 +12415,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -12481,7 +12435,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -12503,7 +12456,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13350,13 +13302,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -13408,13 +13359,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14269,7 +14219,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -14294,7 +14243,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14315,7 +14263,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -14337,7 +14284,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -15173,14 +15119,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -15230,14 +15176,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16088,7 +16034,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 @@ -16113,7 +16058,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16134,7 +16078,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 @@ -16156,7 +16099,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16993,14 +16935,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -17050,14 +16992,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 4ae08a0375c8c..0c624a83ae1be 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -576,12 +576,11 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 8c6224cc86284..6a82dbeec5e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -239,7 +239,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -453,7 +452,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -483,7 +481,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -516,7 +513,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -547,7 +543,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -776,7 +771,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -821,7 +815,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -863,7 +856,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -889,7 +881,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -909,7 +900,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -935,7 +925,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1573,7 +1562,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -1606,7 +1594,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -1834,7 +1821,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1879,7 +1865,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1922,7 +1907,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1948,7 +1932,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1968,7 +1951,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -1995,7 +1977,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 63b46eba41225..dd4c0b0625ea8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -247,7 +247,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -467,7 +466,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -498,7 +496,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -532,7 +529,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -563,7 +559,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -797,7 +792,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -843,7 +837,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -885,7 +878,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -912,7 +904,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -932,7 +923,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -958,7 +948,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1740,7 +1729,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -1773,7 +1761,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -2006,7 +1993,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2052,7 +2038,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2095,7 +2080,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2122,7 +2106,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2142,7 +2125,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 @@ -2169,7 +2151,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 669444e36079d..efcaa8807367b 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2375,17 +2375,17 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index c95ef4217ee5a..be96a39d7476d 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -264,7 +264,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX11-NEXT: s_cbranch_vccz .LBB2_1 ; GFX11-NEXT: ; %bb.3: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index a029164b7acd8..b66ee994ce7ee 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -383,10 +383,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -444,8 +442,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc @@ -2371,10 +2369,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -2403,10 +2399,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -2474,10 +2468,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -2505,8 +2498,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc @@ -4117,10 +4110,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4153,10 +4144,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4267,8 +4256,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -4300,8 +4289,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc @@ -5561,10 +5550,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -5608,10 +5595,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -5730,8 +5715,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -5774,8 +5759,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc @@ -6732,10 +6717,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -6794,8 +6777,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -6823,8 +6806,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc @@ -9041,10 +9024,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -9153,8 +9134,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -9201,8 +9182,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 38adf60888eca..cb557c62c206c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -375,10 +375,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -469,8 +467,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_max_f32 v5, v4, s[4:7], 0 offen offset:1024 glc @@ -1584,10 +1582,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -1618,10 +1614,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -1689,10 +1683,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -1722,8 +1715,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc @@ -3218,10 +3211,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -3257,10 +3248,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -3373,8 +3362,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -3409,8 +3398,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc @@ -4682,10 +4671,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4729,10 +4716,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4851,8 +4836,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -4895,8 +4880,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc @@ -6011,10 +5996,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -6045,10 +6028,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -6146,8 +6127,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -6177,8 +6158,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc @@ -7069,7 +7050,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 @@ -7464,10 +7444,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -7516,10 +7494,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -7634,8 +7610,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -7682,8 +7658,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 2b8cea9068d87..fea674a100b99 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -375,10 +375,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -469,8 +467,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc @@ -1584,10 +1582,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -1618,10 +1614,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -1689,10 +1683,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -1722,8 +1715,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc @@ -3218,10 +3211,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -3257,10 +3248,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -3373,8 +3362,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -3409,8 +3398,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc @@ -4682,10 +4671,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4729,10 +4716,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -4851,8 +4836,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -4895,8 +4880,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc @@ -6011,10 +5996,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -6045,10 +6028,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -6146,8 +6127,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -6177,8 +6158,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc @@ -7069,7 +7050,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 @@ -7464,10 +7444,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -7516,10 +7494,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 @@ -7634,8 +7610,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 @@ -7682,8 +7658,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 8e023723ec25c..ffa9b465af0dd 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -923,7 +923,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr ; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 ; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; SDAG-GFX1100-NEXT: s_clause 0xf ; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen @@ -1097,7 +1096,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 ; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GISEL-GFX1100-NEXT: s_clause 0xf ; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index cdea4fd158b04..aabcd69c88ca3 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -2759,7 +2759,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mul_i32 s7, s5, s0 ; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1 ; GFX11-NEXT: s_mul_i32 s12, s6, s1 @@ -2781,12 +2780,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s1, s1, s7 ; GFX11-NEXT: s_addc_u32 s7, 0, s12 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, s7 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: s_mul_i32 s7, s5, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1 ; GFX11-NEXT: s_mul_i32 s6, s6, s1 ; GFX11-NEXT: s_add_i32 s7, s12, s7 @@ -2807,7 +2804,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s1, s1, s7 ; GFX11-NEXT: s_addc_u32 s5, 0, s5 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, s5 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 @@ -2831,7 +2827,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s0, s0, s7 ; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12 ; GFX11-NEXT: s_mul_i32 s6, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s0, s0, s6 ; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2 ; GFX11-NEXT: s_sub_i32 s6, s11, s0 @@ -2881,18 +2877,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_mul_i32 s1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s1, s0, s2 ; GFX11-NEXT: s_add_i32 s3, s0, 1 ; GFX11-NEXT: s_sub_i32 s1, s10, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s4, s1, s2 ; GFX11-NEXT: s_cmp_ge_u32 s1, s2 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index f8227f0039af7..77dfc859cd1b1 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -38,12 +38,12 @@ define i32 @combine_add_zext_xor() { ; GFX1100-NEXT: s_branch .LBB0_2 ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: s_xor_b32 s0, s0, -1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1100-NEXT: .LBB0_2: ; %.a @@ -118,12 +118,12 @@ define i32 @combine_sub_zext_xor() { ; GFX1100-NEXT: s_branch .LBB1_2 ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: s_xor_b32 s0, s0, -1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1100-NEXT: .LBB1_2: ; %.a @@ -365,11 +365,11 @@ define i32 @combine_add_zext_and() { ; GFX1100-NEXT: s_branch .LBB4_2 ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1100-NEXT: .LBB4_2: ; %.a @@ -444,11 +444,11 @@ define i32 @combine_sub_zext_and() { ; GFX1100-NEXT: s_branch .LBB5_2 ; GFX1100-NEXT: .LBB5_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1100-NEXT: .LBB5_2: ; %.a diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index d61c4b46596c0..64c887d570e54 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -320,7 +320,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1 @@ -348,7 +347,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -441,7 +439,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 @@ -469,7 +466,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1 @@ -556,7 +552,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1 @@ -584,7 +579,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1 @@ -744,7 +738,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 @@ -799,7 +792,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 @@ -962,7 +954,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2 @@ -1011,7 +1002,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2 @@ -1132,7 +1122,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 @@ -1162,7 +1151,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 @@ -1266,7 +1254,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 @@ -1300,7 +1288,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 ; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 @@ -1397,7 +1385,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 @@ -1427,7 +1414,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 @@ -1525,7 +1511,6 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1 @@ -1557,7 +1542,6 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 @@ -1670,7 +1654,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1 @@ -1705,7 +1688,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1 @@ -1805,7 +1787,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1 @@ -1837,7 +1818,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 @@ -2048,7 +2028,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 @@ -2069,7 +2048,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 ; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4 @@ -2094,7 +2072,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 @@ -2137,7 +2114,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4 ; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2 @@ -2155,7 +2131,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5 ; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 @@ -2179,7 +2154,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 @@ -2374,7 +2348,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 @@ -2402,7 +2375,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6 @@ -2444,7 +2416,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2 @@ -2473,7 +2444,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 ; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6 @@ -2585,7 +2555,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1 @@ -2617,7 +2586,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 @@ -2711,7 +2679,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 @@ -2741,7 +2708,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 598cdddaa53d1..f3aec696abdee 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -66,10 +66,10 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s1, s0, 0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo ; GFX11-NEXT: s_cselect_b32 s1, s1, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index fec04a27cda91..19deaf4a5535e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1691,27 +1691,25 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_or_b32 s3, s5, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s5, s3, 0x1000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, s5 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-NEXT: s_or_b32 s5, s7, s5 ; GFX11-NEXT: s_lshl_b32 s6, s2, 12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s6, s3, s6 ; GFX11-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 ; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 0589b6abea26d..c713c48c92457 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8414,13 +8414,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8713,13 +8712,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9290,13 +9288,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9578,13 +9575,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10303,13 +10299,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10605,13 +10600,12 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11242,14 +11236,13 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11597,14 +11590,13 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11953,14 +11945,13 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12297,14 +12288,13 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13535,14 +13525,13 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13893,14 +13882,13 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 5ae54926c4eab..71abe6f32e81e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6310,14 +6310,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6623,14 +6622,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7230,14 +7228,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7534,14 +7531,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8305,14 +8301,13 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8621,14 +8616,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9275,14 +9269,13 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9631,14 +9624,13 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10320,14 +10312,13 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10665,14 +10656,13 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11575,14 +11565,13 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11934,14 +11923,13 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15286,7 +15274,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15622,7 +15609,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15965,7 +15951,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16678,7 +16663,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 99aeb8fe1f80e..49c4b9000d8b5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6310,14 +6310,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6623,14 +6622,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7230,14 +7228,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7534,14 +7531,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8305,14 +8301,13 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8621,14 +8616,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9275,14 +9269,13 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9631,14 +9624,13 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10320,14 +10312,13 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10665,14 +10656,13 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11575,14 +11565,13 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11934,14 +11923,13 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15286,7 +15274,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15622,7 +15609,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15965,7 +15951,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16678,7 +16663,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index b0c76ecd30fbd..a6f8880d6d6f8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -6115,13 +6115,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6414,13 +6413,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6991,13 +6989,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7279,13 +7276,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8004,13 +8000,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8306,13 +8301,12 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8943,14 +8937,13 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9298,14 +9291,13 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9985,14 +9977,13 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10329,14 +10320,13 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11236,14 +11226,13 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11594,14 +11583,13 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14801,7 +14789,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15137,7 +15124,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15480,7 +15466,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16193,7 +16178,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 0b6bdedeb48fc..07c9521e7646a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -15303,7 +15303,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -15465,7 +15464,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 @@ -15639,7 +15637,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -15807,7 +15804,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 @@ -15964,7 +15960,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -16120,7 +16115,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 @@ -16287,7 +16281,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -16449,7 +16442,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 8eda0072a7f4d..49d156788f66c 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -320,10 +320,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -335,10 +334,9 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -372,7 +370,7 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 ; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 ; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe @@ -496,10 +494,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -519,10 +516,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -569,12 +565,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 ; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 ; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index cc11e256d5544..6a0d52962265d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -316,8 +316,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 ; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -368,8 +368,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -423,8 +423,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 ; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -578,8 +578,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 ; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 4bab6eaab6f7d..d1403b6c1a01d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -418,26 +418,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 ; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index fcaf427f6c010..0935438f1b951 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -116,7 +116,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures( ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2252,7 +2251,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2389,7 +2387,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX12-NEXT: v_floor_f16_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b16 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2570,14 +2567,14 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: v_fract_f16_e32 v6, v0 ; GFX12-NEXT: v_floor_f16_e32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_fract_f16_e32 v4, v3 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 ; GFX12-NEXT: v_floor_f16_e32 v7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 ; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 69f4cd4323d99..a466e9f6f6106 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8557,13 +8557,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8908,13 +8907,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9586,13 +9584,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9924,13 +9921,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10775,13 +10771,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11129,13 +11124,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11866,14 +11860,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12273,14 +12266,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13061,14 +13053,13 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13455,14 +13446,13 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14488,14 +14478,13 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14898,14 +14887,13 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e8d73914ad302..a1f5a0289172f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4780,14 +4780,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5145,14 +5144,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5853,14 +5851,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6207,14 +6204,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7104,14 +7100,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7472,14 +7467,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8227,14 +8221,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8636,14 +8629,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9428,14 +9420,13 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9824,14 +9815,13 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10863,14 +10853,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11275,14 +11264,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15230,7 +15218,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15617,7 +15604,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16007,7 +15993,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16809,7 +16794,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index c1c92906df250..b026ed6250ce4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4780,14 +4780,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5145,14 +5144,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5853,14 +5851,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6207,14 +6204,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7104,14 +7100,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7472,14 +7467,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8227,14 +8221,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8636,14 +8629,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9428,14 +9420,13 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9824,14 +9815,13 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10863,14 +10853,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11275,14 +11264,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15230,7 +15218,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -15617,7 +15604,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16007,7 +15993,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16809,7 +16794,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 36852816eaea1..fa619f97256bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5525,13 +5525,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5876,13 +5875,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6554,13 +6552,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6892,13 +6889,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7743,13 +7739,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8097,13 +8092,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8834,14 +8828,13 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9241,14 +9234,13 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10029,14 +10021,13 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10423,14 +10414,13 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11456,14 +11446,13 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11866,14 +11855,13 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15675,7 +15663,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16062,7 +16049,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -16452,7 +16438,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -17254,7 +17239,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index bb13934020f2c..94475e97b2e40 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4734,7 +4734,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc @@ -4854,7 +4854,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 4f5c46d5f424f..9d5b324b271cb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -665,7 +665,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -714,7 +713,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1885,7 +1883,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1934,7 +1931,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3165,7 +3161,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3214,7 +3209,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3941,7 +3935,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3990,7 +3983,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5220,7 +5212,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5282,7 +5273,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -7226,7 +7216,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -8886,7 +8876,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -10319,7 +10309,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -11234,7 +11224,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -13282,7 +13272,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index e1ba4a2b0bf2a..aad38411d12e6 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -613,8 +613,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1652,8 +1652,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2691,8 +2691,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4597,11 +4597,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6120,11 +6119,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -8145,11 +8143,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 6b1d5253e178f..478daf363820f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -613,8 +613,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1652,8 +1652,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2691,8 +2691,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4597,11 +4597,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6120,11 +6119,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -8145,11 +8143,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index d575605f102b7..66f8c2de30530 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -725,7 +725,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -787,7 +786,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2057,7 +2055,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2119,7 +2116,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3389,7 +3385,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3451,7 +3446,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4217,7 +4211,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -4279,7 +4272,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5548,7 +5540,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5610,7 +5601,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -7554,7 +7544,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -9213,7 +9203,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -10646,7 +10636,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -11561,7 +11551,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 @@ -13608,7 +13598,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll index 59b0ba2469a20..a46d629c02b85 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll @@ -108,14 +108,13 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s1, s1, s0 ; GFX11-NEXT: s_bitset1_b32 s0, 22 ; GFX11-NEXT: s_addk_i32 s1, 0x7fff ; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-NEXT: ; return to shader part epilog ; @@ -126,7 +125,6 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX12-NEXT: s_or_b32 s2, s0, 0x400000 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -307,11 +305,10 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s1, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s3, s0, 0x10010 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s3, s3, s0 ; GFX11-NEXT: s_bitset1_b32 s0, 22 ; GFX11-NEXT: s_addk_i32 s3, 0x7fff @@ -341,7 +338,6 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX12-NEXT: s_or_b32 s3, s0, 0x400000 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1165,14 +1161,13 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s1, s1, s0 ; GFX11-NEXT: s_bitset1_b32 s0, 22 ; GFX11-NEXT: s_addk_i32 s1, 0x7fff ; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s0, s0, 16 ; GFX11-NEXT: ; return to shader part epilog ; @@ -1183,7 +1178,6 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX12-NEXT: s_or_b32 s2, s0, 0x400000 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1364,11 +1358,10 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s3, s3, s1 ; GFX11-NEXT: s_bitset1_b32 s1, 22 ; GFX11-NEXT: s_addk_i32 s3, 0x7fff @@ -1398,7 +1391,6 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-NEXT: v_readfirstlane_b32 s0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX12-NEXT: s_or_b32 s3, s0, 0x400000 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 5fab0c50bbe57..0c5b8b096d910 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -112,11 +112,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 ; GFX11-NEXT: s_mov_b64 s[2:3], 0 ; GFX11-NEXT: s_add_i32 s8, s4, s5 @@ -272,11 +271,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 ; GFX11-NEXT: s_mov_b64 s[2:3], 0 ; GFX11-NEXT: s_add_i32 s8, s4, s5 @@ -430,11 +428,10 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mul_i32 s4, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_add_i32 s5, s5, s6 @@ -576,11 +573,10 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mul_i32 s3, s3, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_add_i32 s4, s4, s5 @@ -960,16 +956,15 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0| ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo ; GFX11-NEXT: s_cselect_b32 s4, s4, 0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1 ; GFX11-NEXT: s_add_i32 s3, s3, 1 ; GFX11-NEXT: v_mov_b32_e32 v3, s5 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_scc0 .LBB6_1 @@ -1097,7 +1092,6 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0| ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s5, s5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index ea3d57d127151..44b1bb25bc057 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -118,14 +118,14 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, s30 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s30 ; GFX11-NEXT: s_mul_i32 s0, s0, s22 -; GFX11-NEXT: s_mul_i32 s0, s0, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s20 ; GFX11-NEXT: s_or_b32 s0, s19, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s0, s1 ; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] @@ -145,7 +145,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX11-NEXT: s_and_b32 s1, s8, s1 ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 ; GFX11-NEXT: s_cselect_b32 s1, s19, s13 ; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 @@ -155,12 +155,12 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 ; GFX11-NEXT: v_readfirstlane_b32 s13, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 ; GFX11-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_bitcmp1_b32 s13, 0 ; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s13, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir index 5c9c0d1119163..c287fb3614496 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir @@ -1,14 +1,390 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s + +--- | + + define void @valu_dep_1() { + ; CHECK-LABEL: valu_dep_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @valu_dep_2() { + ; CHECK-LABEL: valu_dep_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @valu_dep_3() { + ; CHECK-LABEL: valu_dep_3: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @valu_dep_4() { + ; CHECK-LABEL: valu_dep_4: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @valu_dep_5() { + ; CHECK-LABEL: valu_dep_5: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 + ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @trans32_dep_1() { + ; CHECK-LABEL: trans32_dep_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @trans32_dep_2() { + ; CHECK-LABEL: trans32_dep_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @trans32_dep_3() { + ; CHECK-LABEL: trans32_dep_3: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: v_exp_f32_e32 v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @trans32_dep_4() { + ; CHECK-LABEL: trans32_dep_4: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: v_exp_f32_e32 v2, v2 + ; CHECK-NEXT: v_exp_f32_e32 v3, v3 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @salu_cycle_1() { + ; CHECK-LABEL: salu_cycle_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + ret void + } + + define void @salu_cycle_2() { + ; CHECK-LABEL: salu_cycle_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + ret void + } + + define void @valu_dep_1_same_trans32_dep_1() { + ; CHECK-LABEL: valu_dep_1_same_trans32_dep_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_exp_f32_e32 v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 + ret void + } + + define void @trans32_dep_1_only() { + ; CHECK-LABEL: trans32_dep_1_only: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_exp_f32_e32 v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 + ret void + } + + define void @valu_dep_1_same_salu_cycle_1() { + ; CHECK-LABEL: valu_dep_1_same_salu_cycle_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_mov_b32 s0, 0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 + ret void + } + + define void @valu_dep_1_next_valu_dep_1() { + ; CHECK-LABEL: valu_dep_1_next_valu_dep_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @valu_dep_2_next_valu_dep_2() { + ; CHECK-LABEL: valu_dep_2_next_valu_dep_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ret void + } + + define void @valu_dep_1_no_next_1() { + ; CHECK-LABEL: valu_dep_1_no_next_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0 + ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0 + ret void + } + + define void @valu_dep_1_no_next_2() { + ; CHECK-LABEL: valu_dep_1_no_next_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) + ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 + ret void + } + + define void @implicit_cmp_cndmask() { + ; CHECK-LABEL: implicit_cmp_cndmask: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1 + ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc + ret void + } + + define void @explicit_cmp_cndmask() { + ; CHECK-LABEL: explicit_cmp_cndmask: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] + ret void + } + + define void @implicit_addc_addc() { + ; CHECK-LABEL: implicit_addc_addc: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc + ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc + ret void + } + + define void @explicit_addc_addc() { + ; CHECK-LABEL: explicit_addc_addc: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0 + ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc + ret void + } + + define void @valu_dep_3_bundle() { + ; CHECK-LABEL: valu_dep_3_bundle: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @if() { + ; CHECK-LABEL: if: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB23_2 + ; CHECK-NEXT: ; %bb.1: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB23_2: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @else() { + ; CHECK-LABEL: else: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB24_2 + ; CHECK-NEXT: ; %bb.1: + ; CHECK-NEXT: s_branch .LBB24_3 + ; CHECK-NEXT: .LBB24_2: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB24_3: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @if_else() { + ; CHECK-LABEL: if_else: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_cbranch_vccz .LBB25_2 + ; CHECK-NEXT: ; %bb.1: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_branch .LBB25_3 + ; CHECK-NEXT: .LBB25_2: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1 + ; CHECK-NEXT: .LBB25_3: + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @loop_1() { + ; CHECK-LABEL: loop_1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0 + ; CHECK-NEXT: s_cbranch_vccz .LBB26_1 + ; CHECK-NEXT: ; %bb.2: + ret void + } + + define void @loop_2() { + ; CHECK-LABEL: loop_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ; CHECK-NEXT: s_cbranch_vccz .LBB27_1 + ; CHECK-NEXT: ; %bb.2: + ret void + } + + define void @sendmsg_rtn() { + ; CHECK-LABEL: sendmsg_rtn: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) + ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) + ; CHECK-NEXT: s_add_u32 s0, s0, s0 + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @flat_load() { + ; CHECK-LABEL: flat_load: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: v_mov_b32_e32 v1, 0 + ; CHECK-NEXT: v_mov_b32_e32 v2, 0 + ; CHECK-NEXT: flat_load_b32 v0, v[0:1] + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2 + ret void + } + + define void @waitcnt_depctr() { + ; CHECK-LABEL: waitcnt_depctr: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_mov_b32_e32 v0, 0 + ; CHECK-NEXT: s_waitcnt_depctr 0xfff + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @writelane1() { + ; CHECK-LABEL: writelane1: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_writelane_b32 v0, s0, 0 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 1 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 2 + ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 + ret void + } + + define void @writelane2() { + ; CHECK-LABEL: writelane2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 + ret void + } + + define void @delay_alu() { + ; CHECK-LABEL: delay_alu: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1 + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ; CHECK-NEXT: s_or_b32 s0, s0, s1 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ret void + } + + define void @redundant_delay_alu() { + ; CHECK-LABEL: redundant_delay_alu: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5 + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7 + ; CHECK-NEXT: s_or_b32 s0, s0, s1 + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ret void + } + + define void @redundant_delay_alu_2() { + ; CHECK-LABEL: redundant_delay_alu_2: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1 + ; CHECK-NEXT: s_or_b32 s0, s0, s1 + ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0 + ret void; + } +... + --- name: valu_dep_1 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec ... @@ -17,12 +393,6 @@ body: | name: valu_dep_2 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -32,13 +402,6 @@ body: | name: valu_dep_3 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_3: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec @@ -49,14 +412,6 @@ body: | name: valu_dep_4 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_4: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 - ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec @@ -70,14 +425,6 @@ body: | name: valu_dep_5 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_5: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 - ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3 - ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec @@ -90,11 +437,6 @@ body: | name: trans32_dep_1 body: | bb.0: - ; CHECK-LABEL: {{^}}trans32_dep_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_exp_f32_e32 v0, v0 - ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec ... @@ -103,12 +445,6 @@ body: | name: trans32_dep_2 body: | bb.0: - ; CHECK-LABEL: {{^}}trans32_dep_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_exp_f32_e32 v0, v0 - ; CHECK-NEXT: v_exp_f32_e32 v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -118,13 +454,6 @@ body: | name: trans32_dep_3 body: | bb.0: - ; CHECK-LABEL: {{^}}trans32_dep_3: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_exp_f32_e32 v0, v0 - ; CHECK-NEXT: v_exp_f32_e32 v1, v1 - ; CHECK-NEXT: v_exp_f32_e32 v2, v2 - ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode @@ -137,13 +466,6 @@ body: | name: trans32_dep_4 body: | bb.0: - ; CHECK-LABEL: {{^}}trans32_dep_4: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_exp_f32_e32 v0, v0 - ; CHECK-NEXT: v_exp_f32_e32 v1, v1 - ; CHECK-NEXT: v_exp_f32_e32 v2, v2 - ; CHECK-NEXT: v_exp_f32_e32 v3, v3 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode @@ -155,11 +477,6 @@ body: | name: salu_cycle_1 body: | bb.0: - ; CHECK-LABEL: {{^}}salu_cycle_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: s_mov_b32 s0, 0 - ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 $sgpr0 = S_MOV_B32 0 $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec ... @@ -170,11 +487,6 @@ body: | name: salu_cycle_2 body: | bb.0: - ; CHECK-LABEL: {{^}}salu_cycle_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: s_mov_b32 s0, 0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 $sgpr0 = S_MOV_B32 0 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec @@ -184,12 +496,6 @@ body: | name: valu_dep_1_same_trans32_dep_1 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_exp_f32_e32 v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec @@ -201,12 +507,6 @@ body: | name: trans32_dep_1_only body: | bb.0: - ; CHECK-LABEL: {{^}}trans32_dep_1_only: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_exp_f32_e32 v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec @@ -216,12 +516,6 @@ body: | name: valu_dep_1_same_salu_cycle_1 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: s_mov_b32 s0, 0 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $sgpr0 = S_MOV_B32 0 $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec @@ -231,12 +525,6 @@ body: | name: valu_dep_1_next_valu_dep_1 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -246,13 +534,6 @@ body: | name: valu_dep_2_next_valu_dep_2 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -266,12 +547,6 @@ body: | name: valu_dep_1_no_next_1 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0 - ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0 $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode @@ -284,13 +559,6 @@ body: | name: valu_dep_1_no_next_2 body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 - ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode @@ -304,10 +572,6 @@ body: | name: implicit_cmp_cndmask body: | bb.0: - ; CHECK-LABEL: {{^}}implicit_cmp_cndmask: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1 - ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec ... @@ -317,11 +581,6 @@ body: | name: explicit_cmp_cndmask body: | bb.0: - ; CHECK-LABEL: {{^}}explicit_cmp_cndmask: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec ... @@ -330,10 +589,6 @@ body: | name: implicit_addc_addc body: | bb.0: - ; CHECK-LABEL: {{^}}implicit_addc_addc: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc - ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec ... @@ -342,10 +597,6 @@ body: | name: explicit_addc_addc body: | bb.0: - ; CHECK-LABEL: {{^}}explicit_addc_addc: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0 - ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec ... @@ -354,13 +605,6 @@ body: | name: valu_dep_3_bundle body: | bb.0: - ; CHECK-LABEL: {{^}}valu_dep_3_bundle: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1 - ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec BUNDLE { $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec @@ -373,14 +617,6 @@ body: | name: if body: | bb.0: - ; CHECK-LABEL: {{^}}if: - ; CHECK: %bb.0: - ; CHECK-NEXT: s_cbranch_vccz .LBB23_2 - ; CHECK-NEXT: %bb.1: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: .LBB23_2: - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 S_CBRANCH_VCCZ %bb.2, implicit $vcc bb.1: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -392,16 +628,6 @@ body: | name: else body: | bb.0: - ; CHECK-LABEL: {{^}}else: - ; CHECK: %bb.0: - ; CHECK-NEXT: s_cbranch_vccz .LBB24_2 - ; CHECK-NEXT: %bb.1 - ; CHECK-NEXT: s_branch .LBB24_3 - ; CHECK-NEXT: .LBB24_2: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: .LBB24_3: - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 S_CBRANCH_VCCZ %bb.2, implicit $vcc bb.1: S_BRANCH %bb.3 @@ -415,18 +641,6 @@ body: | name: if_else body: | bb.0: - ; CHECK-LABEL: {{^}}if_else: - ; CHECK: %bb.0: - ; CHECK-NEXT: s_cbranch_vccz .LBB25_2 - ; CHECK-NEXT: %bb.1: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: s_branch .LBB25_3 - ; CHECK-NEXT: .LBB25_2: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1 - ; CHECK-NEXT: .LBB25_3: - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 S_CBRANCH_VCCZ %bb.2, implicit $vcc bb.1: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -443,13 +657,6 @@ body: | name: loop_1 body: | bb.0: - ; CHECK-LABEL: {{^}}loop_1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: .LBB26_1: - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0 - ; CHECK-NEXT: s_cbranch_vccz .LBB26_1 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec bb.1: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -462,12 +669,6 @@ body: | name: loop_2 body: | bb.0: - ; CHECK-LABEL: {{^}}loop_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: .LBB27_1: - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 - ; CHECK-NEXT: s_cbranch_vccz .LBB27_1 bb.1: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec S_CBRANCH_VCCZ %bb.1, implicit $vcc @@ -480,13 +681,6 @@ body: | name: sendmsg_rtn body: | bb.0: - ; CHECK-LABEL: {{^}}sendmsg_rtn: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_mov_b32_e32 v0, 0 - ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) - ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) - ; CHECK-NEXT: s_add_u32 s0, s0, s0 - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_MOV_B32_e32 0, implicit $exec $sgpr0 = S_SENDMSG_RTN_B32 128 $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc @@ -499,13 +693,6 @@ body: | name: flat_load body: | bb.0: - ; CHECK-LABEL: {{^}}flat_load: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_mov_b32_e32 v0, 0 - ; CHECK-NEXT: v_mov_b32_e32 v1, 0 - ; CHECK-NEXT: v_mov_b32_e32 v2, 0 - ; CHECK-NEXT: flat_load_b32 v0, v[0:1] - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2 $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec @@ -519,11 +706,6 @@ body: | name: waitcnt_depctr body: | bb.0: - ; CHECK-LABEL: {{^}}waitcnt_depctr: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_mov_b32_e32 v0, 0 - ; CHECK-NEXT: s_waitcnt_depctr 0xfff - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT_DEPCTR 4095 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec @@ -534,12 +716,6 @@ body: | name: writelane1 body: | bb.0: - ; CHECK-LABEL: {{^}}writelane1: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_writelane_b32 v0, s0, 0 - ; CHECK-NEXT: v_writelane_b32 v0, s0, 1 - ; CHECK-NEXT: v_writelane_b32 v0, s0, 2 - ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0 $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0 $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0 @@ -551,11 +727,38 @@ body: | name: writelane2 body: | bb.0: - ; CHECK-LABEL: {{^}}writelane2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_writelane_b32 v0, s0, 3 - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) - ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0 $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec ... +# Check if s_delay_alu is added +--- +name: delay_alu +body: | + bb.0: + $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec +... +# Check if redundant delay_alu is removed +--- +name: redundant_delay_alu +body: | + bb.0: + $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec + $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec +... +# Check if redundant delay_alu is removed +--- +name: redundant_delay_alu_2 +body: | + bb.0: + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec + $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec +... + diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 5256cbcef123a..efd4a0044c660 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -555,17 +555,16 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mul_i32 s5, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX11-NEXT: s_add_i32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s4, s4, s5 ; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s5, s4, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s2, s2, s5 ; GFX11-NEXT: s_add_i32 s5, s4, 1 ; GFX11-NEXT: s_sub_i32 s6, s2, s3 @@ -590,12 +589,11 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) ; GFX12-NEXT: s_cvt_u32_f32 s4, s4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX12-NEXT: s_mul_i32 s5, s5, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5 @@ -1062,7 +1060,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index c0c0d3ded117d..dcb1d0e8c20a1 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5125,14 +5125,13 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd @@ -5140,16 +5139,16 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX1200-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v3, v1 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v5, v4, v0 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4] -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v5, v1, v2 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v0, v4 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1] +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -5164,19 +5163,18 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v4, v2 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2] ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd @@ -5185,16 +5183,15 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v5, v8 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -5978,19 +5975,18 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v14, v11, v6 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v15, v10, v7 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v6, 0 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12 ; GFX1200-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd @@ -6040,7 +6036,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_add_co_u32 v9, s0, v2, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v1, vcc_lo -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v9, v6 ; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff @@ -6078,7 +6073,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v8 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v9 ; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v7, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index f961e857f39e5..2053ae970c773 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -33,14 +33,12 @@ define void @issue92561(ptr addrspace(1) %arg) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[4:5] -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[6:7] ; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_and_b32 s0, s0, s1 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_and_b32 s0, s0, s2 -; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-NEXT: s_and_saveexec_b32 s0, s0 ; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -108,14 +106,12 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7] ; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3] ; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_b32 s0, s0, s1 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_b32 s0, s0, s2 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_and_saveexec_b32 s0, s0 ; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 840287b10bb49..3c1d70cfffabf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -132,25 +132,43 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; Divergent row number just causes a readfirstlane for now. define amdgpu_kernel void @id_row_i32() #0 { -; GFX11-LABEL: id_row_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-NEXT: s_mov_b32 m0, s0 -; GFX11-NEXT: exp pos0 v0, off, off, off done row_en -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: id_row_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: id_row_i32: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX12-NEXT: s_mov_b32 m0, s0 -; GFX12-NEXT: export pos0 v0, off, off, off done row_en -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: id_row_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: id_row_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: export pos0 v0, off, off, off done row_en +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: id_row_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: export pos0 v1, off, off, off done row_en +; GFX12-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 110192ecefe55..1e2bf8256321d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -121,12 +121,11 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10 ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -150,7 +149,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4 -; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10 @@ -241,12 +240,11 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12 ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -269,7 +267,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4 -; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12 @@ -363,13 +361,12 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: s_or_saveexec_b32 s8, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v1, s8 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s9 ; GISEL12-NEXT: s_mov_b32 exec_lo, s8 ; GISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v11, v0 ; GISEL12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GISEL12-NEXT: s_wait_alu 0xfffe @@ -613,12 +610,11 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v13, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v13 :: v_dual_add_nc_u32 v10, 42, v12 ; GISEL12-NEXT: ;;#ASMSTART ; GISEL12-NEXT: ; use v0-7 @@ -646,7 +642,7 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4 -; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13 ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4 ; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index e0a5d397bded4..baa904878310b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -26,11 +26,10 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s12 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v1, s13 ; GISEL12-NEXT: s_mov_b64 exec, s[10:11] ; GISEL12-NEXT: v_mov_b32_e32 v11, v0 @@ -61,12 +60,11 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; DAGISEL12-NEXT: s_wait_alu 0xfffe ; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] -; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11] ; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13 -; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13 ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 076cf09678b57..db557ff23c085 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -801,13 +801,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -834,14 +833,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -858,7 +856,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -979,13 +976,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -1012,14 +1008,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1036,7 +1031,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1173,7 +1167,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1323,7 +1317,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1476,7 +1470,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1664,7 +1658,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -3366,13 +3360,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -3399,14 +3392,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3423,7 +3415,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3471,13 +3462,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -3504,14 +3494,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3528,7 +3517,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3738,7 +3726,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -3816,7 +3804,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -4041,7 +4029,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -4123,7 +4111,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -8571,7 +8559,6 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8610,7 +8597,6 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8649,7 +8635,6 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8688,7 +8673,6 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8727,7 +8711,6 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8766,7 +8749,6 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8805,7 +8787,6 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8844,7 +8825,6 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8906,7 +8886,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -8922,7 +8901,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -8985,7 +8963,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -9001,7 +8978,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -9090,7 +9066,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 @@ -9113,7 +9088,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9209,7 +9183,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 @@ -9232,7 +9205,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -9310,7 +9282,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -9328,7 +9299,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9401,7 +9371,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9419,7 +9388,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll index 10c000095fe3d..8eab7e2fc62fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll @@ -35,7 +35,6 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -77,7 +76,6 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -130,7 +128,6 @@ define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 @@ -189,7 +186,6 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 @@ -235,7 +231,6 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -274,7 +269,6 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -317,7 +311,6 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -362,7 +355,6 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -403,7 +395,6 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -442,7 +433,6 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -485,7 +475,6 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -530,7 +519,6 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -571,7 +559,6 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -610,7 +597,6 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -653,7 +639,6 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -698,7 +683,6 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 08d2201036c77..6c032ed061544 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -80,10 +80,8 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: v_readfirstlane_b32 s3, v6 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll index 0ffee36d520dc..2fba9844fbcc1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll @@ -22,7 +22,6 @@ define void @test_readfirstlane_m0(i32 %arg) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index f72f1e52d135f..deeceed3a19be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -428,7 +428,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -449,7 +448,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -471,7 +469,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -492,7 +489,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -809,7 +805,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 ; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8 @@ -847,7 +843,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 ; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 @@ -884,7 +880,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 @@ -922,7 +918,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 4551c60770bdf..434e761a5f8a2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -429,7 +429,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -450,7 +449,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -472,7 +470,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -493,7 +490,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -810,7 +806,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 ; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8 @@ -848,7 +844,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 ; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 @@ -885,7 +881,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 @@ -923,7 +919,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index cf86e2e1dedee..f2ee110c28c6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -13,7 +13,6 @@ define void @test_s_sleep_var1(i32 %arg) { ; GCN-NEXT: s_wait_bvhcnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.sleep.var(i32 %arg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll index af792851e0ced..8aa8fac8b7985 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll @@ -29,7 +29,6 @@ define amdgpu_cs void @ttracedata_v(i32 %val) { ; GFX11-LABEL: ttracedata_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: s_ttracedata ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index fed7a8ec105fd..930aa6eeb62cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -99,8 +99,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -123,11 +123,10 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index a2b9c869c9c9a..2c9f9a6ca4d55 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -49,10 +49,8 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 @@ -90,10 +88,8 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index c2a0028f4f1f1..a86ad8ede2f2c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -273,10 +273,8 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 @@ -412,10 +410,8 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 104462a506c8c..a3bdcbe17cc76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -218,10 +218,8 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 @@ -328,10 +326,8 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index 13bb72a96142f..db6e0ad670747 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc @@ -456,10 +456,8 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 @@ -578,10 +576,9 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 @@ -613,10 +610,8 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index e75dd7409d51b..eef6bb7b0788f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc @@ -456,10 +456,8 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s1, s1 @@ -578,10 +576,9 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 @@ -613,10 +610,8 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll index 46b2516f72f8e..9018160806925 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll @@ -97,8 +97,8 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 6646818b7b36f..04d179478590b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -936,7 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40280000 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1963,7 +1963,6 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1999,7 +1998,6 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2041,7 +2039,6 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2077,7 +2074,6 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2119,7 +2115,6 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2155,7 +2150,6 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2197,7 +2191,6 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2233,7 +2226,6 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2275,7 +2267,6 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2311,7 +2302,6 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2358,7 +2348,6 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 ; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -2400,7 +2389,6 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1 ; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1 ; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -2484,7 +2472,6 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll index 40e124382df95..11cdc625d9adb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll @@ -40,7 +40,6 @@ define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 ; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -119,7 +118,6 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 @@ -169,7 +167,6 @@ define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -221,10 +218,8 @@ define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -266,7 +261,6 @@ define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -318,10 +312,8 @@ define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -363,7 +355,6 @@ define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -415,10 +406,8 @@ define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 87a659de7c95f..4b2f703a26e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -585,7 +585,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s4 ; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo ; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index d7cefd6ed12ec..ff0cec118c169 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -585,7 +585,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s4 ; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo ; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index ebfc953a6bb96..9d24a67a1390a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -376,7 +376,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 ; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5 @@ -701,7 +701,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 ; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index e828a12442fb8..76ca99059d58d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -936,13 +936,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 ; GFX11-NEXT: s_lshr_b32 s2, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 9a2ef15737308..c472ee39a41e4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -749,13 +749,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 ; GFX11-NEXT: s_lshr_b32 s2, s1, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index ed00b4b685161..ab88c5fa36a12 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -110,17 +110,15 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add3_u32 v1, v1, v5, v7 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd @@ -288,22 +286,20 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 ; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo @@ -315,13 +311,12 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll index b88266981a253..006da0cd18867 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll @@ -33,12 +33,11 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 1fc7349882ba1..a9240eff8e691 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1385,8 +1385,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 ; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10005 @@ -1690,7 +1690,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: s_and_b32 s6, 0xffff, s2 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001 @@ -4904,13 +4903,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_and_b32 s2, s2, 1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5658,10 +5657,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-NEXT: v_mov_b32_e32 v7, v1 ; GFX12-NEXT: v_mov_b32_e32 v11, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c @@ -6004,10 +6002,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_load_u16 v0, v32, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshr_b32 s4, s3, 15 ; GFX12-NEXT: s_lshr_b32 s2, s3, 14 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_lshr_b32 s6, s3, 12 ; GFX12-NEXT: s_lshr_b32 s8, s3, 13 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 65d9bc5b452b5..a6f8be8b7559d 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7088,8 +7088,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -7238,10 +7238,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 ; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 @@ -7951,8 +7950,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -8095,10 +8094,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 ; GFX11-NEXT: s_lshl_b32 s7, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 681c07db327dc..a8f4f636949d8 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -6361,7 +6361,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -6726,7 +6725,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index bf56496e98690..9bd831fc2c130 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -6361,7 +6361,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -6726,7 +6725,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index c8848a4d89f10..31368af1c86df 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -7149,7 +7149,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -7514,7 +7513,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index d4f75051b04d4..263dc051737a5 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -382,7 +382,6 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9] ; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 ; GFX12-NEXT: s_wait_alu 0xfffd @@ -1158,7 +1157,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo @@ -1249,11 +1248,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_wait_alu 0xfffd @@ -1798,11 +1796,9 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { ; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index a6db7d331cef3..aaf81e2fa4000 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -3476,7 +3476,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, s4 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5 @@ -3584,7 +3583,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, s4 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5 @@ -3692,7 +3690,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, s4 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5 @@ -3800,7 +3797,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, s4 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index e44803d611f84..8426224d9dd50 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -97,8 +97,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -123,8 +123,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -399,8 +399,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -418,8 +418,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -448,8 +448,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -467,8 +467,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -910,8 +910,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -936,8 +936,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -970,8 +970,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -996,8 +996,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 896cb6042e810..1480743e435ff 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -96,8 +96,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -122,8 +122,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -410,8 +410,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -429,8 +429,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -459,8 +459,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -478,8 +478,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -945,8 +945,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -971,8 +971,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] ; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] -; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 @@ -1005,8 +1005,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 @@ -1031,8 +1031,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] ; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] -; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index d62f045674ace..f43ca4fdc1762 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -92,10 +92,9 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: v_fma_f32 v1, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX12-NEXT: s_and_b32 s0, s0, exec_lo -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b32 s2, s2, s0 ; GFX12-NEXT: s_branch .LBB0_1 ; GFX12-NEXT: .LBB0_4: ; %loop0_merge diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index ee89bf406c2a3..afb0ab958954c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -878,11 +878,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -892,27 +891,24 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index e5e3ba6cdcaf0..733fe9317ddcc 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -213,7 +213,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000 ; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2 ; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -247,7 +247,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) { ; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2 ; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index b4eb775008122..a63d9f22236d5 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -106,11 +106,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_addc_u32 s7, s3, s5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_xor_b32 s2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -439,8 +439,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -560,10 +560,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] ; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 94b22b79f6632..0b68a0534fa08 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -452,10 +452,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -482,10 +481,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 434f266e91d25..ecbb2da5242bd 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1349,7 +1349,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1 ; GFX11-LABEL: no_skip_no_successors: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX11-NEXT: s_cbranch_vccz .LBB12_3 ; GFX11-NEXT: ; %bb.1: ; %bb6 @@ -1361,7 +1360,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1 ; GFX11-NEXT: s_mov_b64 exec, 0 ; GFX11-NEXT: .LBB12_3: ; %bb3 ; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX11-NEXT: ; %bb.4: ; %bb5 ; GFX11-NEXT: .LBB12_5: diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 2a2fd93bc2d0b..eb1b844ad8938 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -452,10 +452,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -482,10 +481,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 5b40d53e0a81c..bff5c6c0db365 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @icmp_test() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_cmp_eq_u16_e64 s[0:1], 0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: ds_store_b32 v1, v0 ; CHECK-NEXT: s_endpgm @@ -27,11 +27,10 @@ define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: v_cmp_le_f16_e64 s[0:1], s0, s1 ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: ds_store_b32 v1, v0 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index 5360ff2fa402f..a3f632267ccd6 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -1625,10 +1625,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] ; GFX11-NEXT: s_endpgm @@ -1648,10 +1647,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 0211c5111c31d..e30c8a53b0571 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -57,8 +57,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5