From c8f4c35a6624c23632fbca7f5f384655ef4811f0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 5 Mar 2025 01:06:11 +0700
Subject: [PATCH] AMDGPU: Correctly handle folding immediates into
 subregister use operands (#129664)

This fixes a miscompile where a 64-bit materialize is incorrectly folded
into a sub1 use operand. We currently do not see many subregister use
operands. Incidentally, there are also SIFoldOperands bugs that prevent
this fold from appearing here. This pre-emptively fixes folding of 32-bit
subregister uses of 64-bit materializes, in preparation for future
patches.

The existing APIs are awkward since they expect a fully formed
instruction with operands to read from, rather than something new that
still needs to be created.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp    | 65 +++++++++++++++----
 .../AMDGPU/si-fold-operands-subreg-imm.mir   |  8 +--
 2 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a3f303293461..2e66796bcb6bc 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,6 +50,11 @@ struct FoldCandidate {
     }
   }
 
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
+                bool Commuted_ = false, int ShrinkOp = -1)
+      : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+        Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }
@@ -578,16 +583,29 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 }
 
 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
-                                MachineInstr *MI, unsigned OpNo,
-                                MachineOperand *FoldOp, bool Commuted = false,
-                                int ShrinkOp = -1) {
+                                FoldCandidate &&Entry) {
   // Skip additional folding on the same operand.
   for (FoldCandidate &Fold : FoldList)
-    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
       return;
-  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
-                    << " operand " << OpNo << "\n  " << *MI);
-  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
+  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
"commuted" : "normal") + << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI); + FoldList.push_back(Entry); +} + +static void appendFoldCandidate(SmallVectorImpl &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *FoldOp, bool Commuted = false, + int ShrinkOp = -1) { + appendFoldCandidate(FoldList, + FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); +} + +static void appendFoldCandidate(SmallVectorImpl &FoldList, + MachineInstr *MI, unsigned OpNo, int64_t ImmVal, + bool Commuted = false, int ShrinkOp = -1) { + appendFoldCandidate(FoldList, + FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp)); } bool SIFoldOperandsImpl::tryAddToFoldList( @@ -847,11 +865,35 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { - appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold); - return true; + MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + if (OpToFold.isImm()) { + if (unsigned UseSubReg = UseOp.getSubReg()) { + std::optional SubImm = + SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg); + if (!SubImm) + return false; + + // TODO: Avoid the temporary MachineOperand + MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm); + if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) { + appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm); + return true; + } + + return false; + } + + if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { + appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold); + return true; + } } + // TODO: Verify the following code handles subregisters correctly. + // TODO: Handle extract of global reference + if (UseOp.getSubReg()) + return false; + if (!OpToFold.isReg()) return false; @@ -861,8 +903,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm( // Maybe it is just a COPY of an immediate itself. 
   MachineInstr *Def = MRI->getVRegDef(UseReg);
-  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
-  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
+  if (Def && TII->isFoldableCopy(*Def)) {
     MachineOperand &DefOp = Def->getOperand(1);
     if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
       appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
index 591bda2b22f12..2389477bb72a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@@ -17,7 +17,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[REG_SEQUENCE]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:sgpr_64 = COPY $sgpr8_sgpr9
@@ -42,8 +42,8 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
-    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 30064771075, 0, implicit $exec
-    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 30064771075, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 3, 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 7, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:vreg_64 = COPY $vgpr8_vgpr9
@@ -116,7 +116,7 @@ body: |
     ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 8
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B64_]].sub1, %subreg.sub0, [[S_MOV_B64_]].sub0, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_ADD_U32_]], implicit [[S_ADDC_U32_]]
    %0:sgpr_64 = COPY $sgpr8_sgpr9
    %1:sreg_64 = S_MOV_B64 8
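
Reviewer note (illustrative, not part of the patch): the fix hinges on what a
32-bit subregister use actually reads out of a 64-bit immediate materialize.
The standalone C++ sketch below mirrors that extraction under simplified
assumptions; SubIdx and extractSubregFromImm64 are hypothetical names used
for illustration only, not the in-tree SIInstrInfo::extractSubregFromImm API.

  // Illustrative sketch: extract the 32-bit half of a 64-bit immediate that
  // a sub0/sub1 use operand reads. Builds with any C++17 compiler.
  #include <cassert>
  #include <cstdint>
  #include <cstdio>
  #include <optional>

  enum class SubIdx { Sub0, Sub1 }; // low / high 32-bit half of a 64-bit value

  // Returns the selected 32-bit half of Imm, sign-extended the way a 32-bit
  // immediate operand is interpreted. The optional mirrors the real helper,
  // which can fail for subregister indexes it does not handle.
  static std::optional<int64_t> extractSubregFromImm64(int64_t Imm, SubIdx Idx) {
    const uint64_t U = static_cast<uint64_t>(Imm);
    const uint32_t Half = Idx == SubIdx::Sub0 ? static_cast<uint32_t>(U)
                                              : static_cast<uint32_t>(U >> 32);
    return static_cast<int64_t>(static_cast<int32_t>(Half));
  }

  int main() {
    // S_MOV_B64 8 materializes sub0 = 8 and sub1 = 0, so an S_ADDC_U32 use
    // of .sub1 must fold 0; folding 8 was the miscompile this patch fixes.
    assert(*extractSubregFromImm64(8, SubIdx::Sub0) == 8);
    assert(*extractSubregFromImm64(8, SubIdx::Sub1) == 0);

    // 30064771075 is 0x700000003: sub0 = 3 and sub1 = 7, matching the
    // updated V_ADD_CO_U32 / V_ADDC_U32 checks in the MIR test above.
    assert(*extractSubregFromImm64(30064771075, SubIdx::Sub0) == 3);
    assert(*extractSubregFromImm64(30064771075, SubIdx::Sub1) == 7);

    puts("subregister immediate extraction behaves as expected");
  }

With the patch, tryToFoldACImm performs this kind of extraction (via
SIInstrInfo::extractSubregFromImm) before legality-checking the narrowed
immediate and appending it as a fold candidate.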