-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reland "[AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (#127212)" #131111
Conversation
Consider a VALU->SGPR->SALU dependency: a VALU instruction writes an SGPR and a SALU instruction then reads it. When the VALU instruction is issued, it increments the internal counter VA_SDST, which is used to track use of this SGPR. The SALU instruction will not issue until VA_SDST is zero, that is, until the VALU instruction has finished writing. Therefore, the delays added by s_delay_alu are not needed in this situation.
Updated test |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Ana Mihajlovic (mihajlovicana) Changes: Consider a VALU->SGPR->SALU dependency: a VALU instruction writes an SGPR and a SALU instruction then reads it. When the VALU instruction is issued, it increments the internal counter VA_SDST, which is used to track use of this SGPR. The SALU instruction will not issue until VA_SDST is zero, that is, until the VALU instruction has finished writing. Therefore, the delays added by s_delay_alu are not needed in this situation. Patch is 521.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131111.diff 94 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index b25619b4c5422..51c4528e07d62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -47,6 +47,13 @@ class AMDGPUInsertDelayAlu {
return false;
}
+ static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
+ // These instruction types wait for VA_SDST==0 before issuing.
+ const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
+
+ return MI.getDesc().TSFlags & VA_SDST_0;
+ }
+
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
@@ -227,6 +234,16 @@ class AMDGPUInsertDelayAlu {
}
}
+ void advanceByVALUNum(unsigned VALUNum) {
+ iterator Next;
+ for (auto I = begin(), E = end(); I != E; I = Next) {
+ Next = std::next(I);
+ if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
+ erase(I);
+ }
+ }
+ }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const TargetRegisterInfo *TRI) const {
if (empty()) {
@@ -331,6 +348,7 @@ class AMDGPUInsertDelayAlu {
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
+ MCRegUnit LastSGPRFromVALU = 0;
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -345,6 +363,15 @@ class AMDGPUInsertDelayAlu {
DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ if (instructionWaitsForSGPRWrites(MI)) {
+ auto It = State.find(LastSGPRFromVALU);
+ if (It != State.end()) {
+ DelayInfo Info = It->getSecond();
+ State.advanceByVALUNum(Info.VALUNum);
+ LastSGPRFromVALU = 0;
+ }
+ }
+
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
// TODO: This is overkill since it also forgets about SALU delays.
@@ -368,6 +395,17 @@ class AMDGPUInsertDelayAlu {
}
}
}
+
+ if (SII->isVALU(MI.getOpcode())) {
+ for (const auto &Op : MI.defs()) {
+ Register Reg = Op.getReg();
+ if (AMDGPU::isSGPR(Reg, TRI)) {
+ LastSGPRFromVALU = *TRI->regunits(Reg).begin();
+ break;
+ }
+ }
+ }
+
if (Emit && !MI.isBundledWithPred()) {
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
// just ignore them?
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index af21a07a4c3a1..e2d179a77f76c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 6e55d7fdb5e95..be894f2c76f67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -361,21 +361,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -385,21 +385,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float %a, %b
@@ -2766,21 +2766,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2790,21 +2790,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
@@ -3981,21 +3981,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4005,21 +4005,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_constrhs0_dynamic:
@@ -4359,21 +4359,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4383,21 +4383,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_constlhs0_dynamic:
@@ -4732,21 +4732,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4756,21 +4756,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
@@ -5121,21 +5121,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: ...
[truncated]
|
…" (llvm#131111) We have a VALU->SGPR->SALU (VALU writing to SGPR and SALU reading from it). When VALU is issued, it increments internal counter VA_SDST used to track use of this SGPR. SALU will not issue until VA_SDST is zero, that is when VALU is finished writing. Therefore, delays added by s_delay_alu are not needed in this situation.
Consider a VALU->SGPR->SALU dependency: a VALU instruction writes an SGPR and a SALU instruction then reads it. When the VALU instruction is issued, it increments the internal counter VA_SDST, which is used to track use of this SGPR. The SALU instruction will not issue until VA_SDST is zero, that is, until the VALU instruction has finished writing. Therefore, the delays added by s_delay_alu are not needed in this situation.