-
Notifications
You must be signed in to change notification settings - Fork 13.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/GlobalISel: add RegBankLegalize rules for bitfield extract #132381
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,9 +15,13 @@ | |
#include "AMDGPUGlobalISelUtils.h" | ||
#include "AMDGPUInstrInfo.h" | ||
#include "AMDGPURegisterBankInfo.h" | ||
#include "GCNSubtarget.h" | ||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h" | ||
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" | ||
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||
#include "llvm/CodeGen/MachineUniformityAnalysis.h" | ||
#include "llvm/IR/IntrinsicsAMDGPU.h" | ||
#include "llvm/Support/ErrorHandling.h" | ||
|
||
#define DEBUG_TYPE "amdgpu-regbanklegalize" | ||
|
||
|
@@ -127,6 +131,117 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, | |
MI.eraseFromParent(); | ||
} | ||
|
||
bool isSignedBFE(MachineInstr &MI) { | ||
unsigned Opc = | ||
isa<GIntrinsic>(MI) ? MI.getOperand(1).getIntrinsicID() : MI.getOpcode(); | ||
Comment on lines
+135
to
+136
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather just handle these as two separate cases instead of incorrectly merge 2 separate enum spaces There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, it should be two separate switches |
||
|
||
switch (Opc) { | ||
case AMDGPU::G_SBFX: | ||
case Intrinsic::amdgcn_sbfe: | ||
return true; | ||
case AMDGPU::G_UBFX: | ||
case Intrinsic::amdgcn_ubfe: | ||
return false; | ||
default: | ||
llvm_unreachable("Opcode not supported"); | ||
} | ||
} | ||
|
||
void RegBankLegalizeHelper::lowerDiv_BFE(MachineInstr &MI) { | ||
Register Dst = MI.getOperand(0).getReg(); | ||
assert(MRI.getType(Dst) == LLT::scalar(64)); | ||
bool Signed = isSignedBFE(MI); | ||
unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1; | ||
// Extract bitfield from Src, LSBit is the least-significant bit for the | ||
// extraction (field offset) and Width is size of bitfield. | ||
Register Src = MI.getOperand(FirstOpnd).getReg(); | ||
Register LSBit = MI.getOperand(FirstOpnd + 1).getReg(); | ||
Register Width = MI.getOperand(FirstOpnd + 2).getReg(); | ||
// Comments are for signed bitfield extract, similar for unsigned. x is sign | ||
// bit. s is sign, l is LSB and y are remaining bits of bitfield to extract. | ||
|
||
// Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl | ||
unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR; | ||
auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit}); | ||
|
||
auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI); | ||
|
||
// Expand to Src >> LSBit << (64 - Width) >> (64 - Width) | ||
// << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000 | ||
// >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl | ||
if (!ConstWidth) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this functionally the same as the intrinsic case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume so, looks like intrinsics were introduced first, then the G_ opcodes |
||
auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width); | ||
auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt); | ||
B.buildInstr(SHROpc, {Dst}, {SignBit, Amt}); | ||
MI.eraseFromParent(); | ||
return; | ||
} | ||
|
||
auto WidthImm = ConstWidth->Value.getZExtValue(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no auto |
||
auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc); | ||
Register SHRSrcLo = UnmergeSHRSrc.getReg(0); | ||
Register SHRSrcHi = UnmergeSHRSrc.getReg(1); | ||
auto Zero = B.buildConstant({VgprRB, S32}, 0); | ||
unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX; | ||
|
||
if (WidthImm <= 32) { | ||
// SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl | ||
auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width}); | ||
MachineInstrBuilder Hi; | ||
if (Signed) { | ||
// SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl | ||
Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31)); | ||
} else { | ||
// SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl | ||
Hi = Zero; | ||
} | ||
B.buildMergeLikeInstr(Dst, {Lo, Hi}); | ||
} else { | ||
auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32); | ||
// SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl | ||
auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt}); | ||
B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi}); | ||
} | ||
|
||
MI.eraseFromParent(); | ||
return; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. useless return |
||
} | ||
|
||
void RegBankLegalizeHelper::lowerUni_BFE(MachineInstr &MI) { | ||
Register DstReg = MI.getOperand(0).getReg(); | ||
LLT Ty = MRI.getType(DstReg); | ||
bool Signed = isSignedBFE(MI); | ||
unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1; | ||
Register Src = MI.getOperand(FirstOpnd).getReg(); | ||
Register LSBit = MI.getOperand(FirstOpnd + 1).getReg(); | ||
Register Width = MI.getOperand(FirstOpnd + 2).getReg(); | ||
// For uniform bit field extract there are 4 available instructions, but | ||
// LSBit(field offset) and Width(size of bitfield) need to be packed in S32, | ||
// field offset in low and size in high 16 bits. | ||
|
||
// Src1 Hi16|Lo16 = Size|FieldOffset | ||
auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6)); | ||
auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask); | ||
Comment on lines
+223
to
+224
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there's a buildZextInReg helper |
||
auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16)); | ||
auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size); | ||
unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; | ||
unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; | ||
Comment on lines
+227
to
+228
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm worried about this harming the ability of downstream transforms and analyses to do anything with the operation. We should still be able to handle known and demanded bits. Also don't want to break any patterns that read BFE inputs. Except for the intrinsic with variable inputs case, can't you just use the G_* opcodes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are no tablegen patterns for S_BFE, known bits for GlobalISel are missing support for many generic opcodes not to mention target intrinsics. Can we leave that for another patch? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Since the two operands are merged into one operand, I think you need to manually select it. Should add a fixme to stop directly selecting it |
||
unsigned Opc = Ty == S32 ? Opc32 : Opc64; | ||
|
||
// Select machine instruction, because of reg class constraining, insert | ||
// copies from reg class to reg bank. | ||
auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}}, | ||
{B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)}); | ||
const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd expect this to be a member already |
||
if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), | ||
*ST.getRegisterInfo(), RBI)) | ||
llvm_unreachable("failed to constrain BFE"); | ||
|
||
B.buildCopy(DstReg, S_BFE->getOperand(0).getReg()); | ||
MI.eraseFromParent(); | ||
return; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. useless return |
||
} | ||
|
||
void RegBankLegalizeHelper::lower(MachineInstr &MI, | ||
const RegBankLLTMapping &Mapping, | ||
SmallSet<Register, 4> &WaterfallSgprs) { | ||
|
@@ -225,6 +340,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, | |
MI.eraseFromParent(); | ||
break; | ||
} | ||
case Div_BFE: | ||
return lowerDiv_BFE(MI); | ||
case Uni_BFE: | ||
return lowerUni_BFE(MI); | ||
case SplitLoad: { | ||
LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); | ||
unsigned Size = DstTy.getSizeInBits(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this include needed?