Skip to content

Commit 9a0eaa1

Browse files
AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg
Uniform S16 shifts have to be extended to S32 using the appropriate extend before lowering to an S32 instruction. Uniform packed V2S16 shifts are lowered to SGPR S32 instructions; the other option is to use VALU packed V2S16 instructions and ReadAnyLane. For uniform S32 and S64 and divergent S16, S32, S64 and V2S16, there are instructions available.
1 parent 6560c53 commit 9a0eaa1

13 files changed

+311
-151
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "GCNSubtarget.h"
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26+
#include "llvm/CodeGen/GlobalISel/Utils.h"
2627
#include "llvm/CodeGen/MachineFunctionPass.h"
2728
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2829
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -306,7 +307,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
306307
// Opcodes that support pretty much all combinations of reg banks and LLTs
307308
// (except S1). There is no point in writing rules for them.
308309
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
309-
Opc == AMDGPU::G_MERGE_VALUES) {
310+
Opc == AMDGPU::G_MERGE_VALUES || Opc == G_BITCAST) {
310311
RBLHelper.applyMappingTrivial(*MI);
311312
continue;
312313
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

+104
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
#include "AMDGPURegBankLegalizeHelper.h"
1515
#include "AMDGPUGlobalISelUtils.h"
1616
#include "AMDGPUInstrInfo.h"
17+
#include "AMDGPURegBankLegalizeRules.h"
1718
#include "AMDGPURegisterBankInfo.h"
1819
#include "GCNSubtarget.h"
1920
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2021
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
22+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
2123
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2224
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2325
#include "llvm/IR/IntrinsicsAMDGPU.h"
26+
#include "llvm/Support/ErrorHandling.h"
2427

2528
#define DEBUG_TYPE "amdgpu-regbanklegalize"
2629

@@ -130,6 +133,28 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
130133
MI.eraseFromParent();
131134
}
132135

136+
// Splits a 32-bit packed value into two SgprRB_S32 values, zero-extending each
// 16-bit half. Reg is bitcast to SgprRB_S32; Lo is the bitcast value ANDed with
// 0x0000ffff, Hi is the bitcast value logically shifted right by 16.
// Returns {Lo, Hi} as registers. The Unpack lowering calls this for both
// operands of G_LSHR.
// NOTE(review): Reg is presumably a uniform V2S16 in the SGPR bank (the Unpack
// rules map operands to SgprV2S16) - confirm against callers.
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
137+
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
138+
auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
139+
auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
140+
auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
141+
return {Lo.getReg(0), Hi.getReg(0)};
142+
}
143+
144+
// Splits a 32-bit packed value into two SgprRB_S32 values, sign-extending each
// 16-bit half. Reg is bitcast to SgprRB_S32; Lo is a sign-extend-in-register of
// the low 16 bits, Hi is the bitcast value arithmetically shifted right by 16.
// Returns {Lo, Hi} as registers. The Unpack lowering calls this for both
// operands of G_ASHR.
// NOTE(review): Reg is presumably a uniform V2S16 in the SGPR bank (the Unpack
// rules map operands to SgprV2S16) - confirm against callers.
std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
145+
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
146+
auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
147+
auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
148+
return {Lo.getReg(0), Hi.getReg(0)};
149+
}
150+
151+
// Splits a 32-bit packed value into two SgprRB_S32 values without cleaning the
// upper bits of the low half. Reg is bitcast to SgprRB_S32; Lo is that bitcast
// value unchanged (its upper 16 bits still hold the high half), Hi is the
// bitcast value logically shifted right by 16.
// Returns {Lo, Hi} as registers. The Unpack lowering calls this for both
// operands of G_SHL.
// NOTE(review): Reg is presumably a uniform V2S16 in the SGPR bank (the Unpack
// rules map operands to SgprV2S16) - confirm against callers.
std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
152+
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
153+
auto Lo = PackedS32;
154+
auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
155+
return {Lo.getReg(0), Hi.getReg(0)};
156+
}
157+
133158
void RegBankLegalizeHelper::lower(MachineInstr &MI,
134159
const RegBankLLTMapping &Mapping,
135160
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -259,6 +284,33 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
259284
MI.eraseFromParent();
260285
break;
261286
}
287+
case SExtInRegSplitTo32: {
288+
auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
289+
int Amt = MI.getOperand(2).getImm();
290+
Register Lo, Hi;
291+
// Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
292+
if (Amt <= 32) {
293+
auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
294+
if (Amt == 32) {
295+
// Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
296+
Lo = Freeze.getReg(0);
297+
} else {
298+
// Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
299+
Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
300+
}
301+
302+
auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
303+
Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
304+
} else {
305+
// Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
306+
Lo = Op1.getReg(0);
307+
Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
308+
}
309+
310+
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
311+
MI.eraseFromParent();
312+
break;
313+
}
262314
case Div_BFE: {
263315
Register Dst = MI.getOperand(0).getReg();
264316
assert(MRI.getType(Dst) == LLT::scalar(64));
@@ -356,6 +408,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
356408
MI.eraseFromParent();
357409
return;
358410
}
411+
case Unpack: {
412+
Register Lo, Hi;
413+
switch (MI.getOpcode()) {
414+
case AMDGPU::G_SHL: {
415+
auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
416+
auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
417+
Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
418+
Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
419+
break;
420+
}
421+
case AMDGPU::G_LSHR: {
422+
auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
423+
auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
424+
Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
425+
Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
426+
break;
427+
}
428+
case AMDGPU::G_ASHR: {
429+
auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
430+
auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
431+
Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
432+
Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
433+
break;
434+
}
435+
default:
436+
llvm_unreachable("Unpack lowering not implemented");
437+
}
438+
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
439+
MI.eraseFromParent();
440+
return;
441+
}
359442
case SplitLoad: {
360443
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
361444
unsigned Size = DstTy.getSizeInBits();
@@ -445,6 +528,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
445528
case SgprP5:
446529
case VgprP5:
447530
return LLT::pointer(5, 32);
531+
case SgprV2S16:
532+
case VgprV2S16:
533+
case UniInVgprV2S16:
534+
return LLT::fixed_vector(2, 16);
535+
case SgprV2S32:
536+
case VgprV2S32:
537+
return LLT::fixed_vector(2, 32);
448538
case SgprV4S32:
449539
case VgprV4S32:
450540
case UniInVgprV4S32:
@@ -518,6 +608,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
518608
case SgprP3:
519609
case SgprP4:
520610
case SgprP5:
611+
case SgprV2S16:
612+
case SgprV2S32:
521613
case SgprV4S32:
522614
case SgprB32:
523615
case SgprB64:
@@ -527,6 +619,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
527619
case SgprB512:
528620
case UniInVcc:
529621
case UniInVgprS32:
622+
case UniInVgprV2S16:
530623
case UniInVgprV4S32:
531624
case UniInVgprB32:
532625
case UniInVgprB64:
@@ -548,6 +641,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
548641
case VgprP3:
549642
case VgprP4:
550643
case VgprP5:
644+
case VgprV2S16:
645+
case VgprV2S32:
551646
case VgprV4S32:
552647
case VgprB32:
553648
case VgprB64:
@@ -585,6 +680,8 @@ void RegBankLegalizeHelper::applyMappingDst(
585680
case SgprP3:
586681
case SgprP4:
587682
case SgprP5:
683+
case SgprV2S16:
684+
case SgprV2S32:
588685
case SgprV4S32:
589686
case Vgpr16:
590687
case Vgpr32:
@@ -594,6 +691,8 @@ void RegBankLegalizeHelper::applyMappingDst(
594691
case VgprP3:
595692
case VgprP4:
596693
case VgprP5:
694+
case VgprV2S16:
695+
case VgprV2S32:
597696
case VgprV4S32: {
598697
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
599698
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
@@ -628,6 +727,7 @@ void RegBankLegalizeHelper::applyMappingDst(
628727
break;
629728
}
630729
case UniInVgprS32:
730+
case UniInVgprV2S16:
631731
case UniInVgprV4S32: {
632732
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
633733
assert(RB == SgprRB);
@@ -701,6 +801,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
701801
case SgprP3:
702802
case SgprP4:
703803
case SgprP5:
804+
case SgprV2S16:
805+
case SgprV2S32:
704806
case SgprV4S32: {
705807
assert(Ty == getTyFromID(MethodIDs[i]));
706808
assert(RB == getRegBankFromID(MethodIDs[i]));
@@ -726,6 +828,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
726828
case VgprP3:
727829
case VgprP4:
728830
case VgprP5:
831+
case VgprV2S16:
832+
case VgprV2S32:
729833
case VgprV4S32: {
730834
assert(Ty == getTyFromID(MethodIDs[i]));
731835
if (RB != VgprRB) {

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

+5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "AMDGPURegBankLegalizeRules.h"
1313
#include "llvm/ADT/SmallSet.h"
1414
#include "llvm/CodeGen/MachineRegisterInfo.h"
15+
#include "llvm/CodeGen/Register.h"
1516

1617
namespace llvm {
1718

@@ -108,6 +109,10 @@ class RegBankLegalizeHelper {
108109

109110
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
110111
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
112+
113+
std::pair<Register, Register> unpackZExt(Register Reg);
114+
std::pair<Register, Register> unpackSExt(Register Reg);
115+
std::pair<Register, Register> unpackAExt(Register Reg);
111116
};
112117

113118
} // end namespace AMDGPU

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

+43-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
#include "AMDGPURegBankLegalizeRules.h"
1717
#include "AMDGPUInstrInfo.h"
1818
#include "GCNSubtarget.h"
19+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1920
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
21+
#include "llvm/CodeGen/MachineInstr.h"
2022
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2123
#include "llvm/IR/IntrinsicsAMDGPU.h"
2224
#include "llvm/Support/AMDGPUAddrSpace.h"
@@ -60,6 +62,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
6062
return MRI.getType(Reg) == LLT::pointer(4, 64);
6163
case P5:
6264
return MRI.getType(Reg) == LLT::pointer(5, 32);
65+
case V2S32:
66+
return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
6367
case V4S32:
6468
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
6569
case B32:
@@ -92,6 +96,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
9296
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
9397
case UniP5:
9498
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
99+
case UniV2S16:
100+
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
95101
case UniB32:
96102
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
97103
case UniB64:
@@ -122,6 +128,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
122128
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
123129
case DivP5:
124130
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
131+
case DivV2S16:
132+
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
125133
case DivB32:
126134
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
127135
case DivB64:
@@ -434,7 +442,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
434442
MachineRegisterInfo &_MRI)
435443
: ST(&_ST), MRI(&_MRI) {
436444

437-
addRulesForGOpcs({G_ADD}, Standard)
445+
addRulesForGOpcs({G_ADD, G_SUB}, Standard)
438446
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
439447
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
440448

@@ -451,11 +459,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
451459
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
452460

453461
addRulesForGOpcs({G_SHL}, Standard)
462+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
463+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
464+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
465+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
466+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
467+
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
454468
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
469+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
470+
471+
addRulesForGOpcs({G_LSHR}, Standard)
472+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
473+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
474+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
475+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
476+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
455477
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
478+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
456479
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
457480

458-
addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}});
481+
addRulesForGOpcs({G_ASHR}, Standard)
482+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
483+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
484+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
485+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
486+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
487+
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
488+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
489+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
490+
491+
addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
459492

460493
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
461494
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, Uni_BFE})
@@ -514,6 +547,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
514547
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
515548
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
516549
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
550+
.Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
551+
.Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
517552
// This is non-trivial. VgprToVccCopy is done using compare instruction.
518553
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
519554
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
@@ -549,6 +584,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
549584
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
550585
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
551586

587+
addRulesForGOpcs({G_SEXT_INREG})
588+
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
589+
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
590+
.Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
591+
.Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SExtInRegSplitTo32}});
592+
552593
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
553594
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
554595

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

+11
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID {
7575
V3S32,
7676
V4S32,
7777

78+
UniV2S16,
79+
80+
DivV2S16,
81+
7882
// B types
7983
B32,
8084
B64,
@@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID {
117121
SgprP3,
118122
SgprP4,
119123
SgprP5,
124+
SgprV2S16,
120125
SgprV4S32,
126+
SgprV2S32,
121127
SgprB32,
122128
SgprB64,
123129
SgprB96,
@@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID {
134140
VgprP3,
135141
VgprP4,
136142
VgprP5,
143+
VgprV2S16,
144+
VgprV2S32,
137145
VgprB32,
138146
VgprB64,
139147
VgprB96,
@@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID {
145153
// Dst only modifiers: read-any-lane and truncs
146154
UniInVcc,
147155
UniInVgprS32,
156+
UniInVgprV2S16,
148157
UniInVgprV4S32,
149158
UniInVgprB32,
150159
UniInVgprB64,
@@ -173,13 +182,15 @@ enum LoweringMethodID {
173182
DoNotLower,
174183
VccExtToSel,
175184
UniExtToSel,
185+
SExtInRegSplitTo32,
176186
Uni_BFE,
177187
Div_BFE,
178188
VgprToVccCopy,
179189
SplitTo32,
180190
SplitTo32Sel,
181191
Ext32To64,
182192
UniCstExt,
193+
Unpack,
183194
SplitLoad,
184195
WidenLoad,
185196
};

0 commit comments

Comments
 (0)