AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg #132385
base: users/petar-avramovic/select
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

Uniform S16 shifts have to be extended to S32 using the appropriate extend before lowering to the S32 instruction. Uniform packed V2S16 shifts are lowered to SGPR S32 instructions; the other option would be to use VALU packed V2S16 and ReadAnyLane. For uniform S32 and S64, and for divergent S16, S32, S64 and V2S16, instructions are available directly.

Patch is 48.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132385.diff

13 Files Affected:
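To see why each opcode picks a different extend for its uniform 16-bit operands, here is a minimal standalone C++ sketch (the helper names and asserts are mine, not part of the patch): shifting the extended 32-bit value and truncating back must reproduce the 16-bit shift, which holds with any-extend of the value for G_SHL, zero-extend for G_LSHR, and sign-extend for G_ASHR, with the shift amount zero-extended so it stays in range.

```cpp
#include <cassert>
#include <cstdint>

// 16-bit shifts emulated by the extended 32-bit shift the rules select,
// then truncated back to 16 bits. Assumes in-range shift amounts (< 16).
uint16_t shl16_via32(uint16_t v, uint16_t amt) {
  // G_SHL: the value may be any-extended; garbage above bit 15 shifts
  // further left and is dropped by the truncate.
  uint32_t ext = v;
  return static_cast<uint16_t>(ext << amt);
}

uint16_t lshr16_via32(uint16_t v, uint16_t amt) {
  // G_LSHR: the value must be zero-extended so zeros shift into bit 15.
  uint32_t ext = v;
  return static_cast<uint16_t>(ext >> amt);
}

uint16_t ashr16_via32(uint16_t v, uint16_t amt) {
  // G_ASHR: the value must be sign-extended so copies of bit 15 shift in.
  // Assumes >> on a negative int32_t is an arithmetic shift.
  int32_t ext = static_cast<int16_t>(v);
  return static_cast<uint16_t>(ext >> amt);
}

int main() {
  assert(shl16_via32(0x4001, 1) == 0x8002);
  assert(lshr16_via32(0x8000, 1) == 0x4000);
  assert(ashr16_via32(0x8000, 1) == 0xc000);
}
```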
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 44f1b5419abb9..4fd776bec9492 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -306,7 +307,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES) {
+ Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 0f5f3545ac8eb..59f16315bbd72 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -14,13 +14,16 @@
#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "amdgpu-regbanklegalize"
@@ -130,6 +133,28 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
MI.eraseFromParent();
}
+std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
+ auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
+ auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
+ auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Lo = PackedS32;
+ auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
void RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -259,6 +284,33 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
MI.eraseFromParent();
break;
}
+ case SExtInRegSplitTo32: {
+ auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
+ int Amt = MI.getOperand(2).getImm();
+ Register Lo, Hi;
+ // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
+ if (Amt <= 32) {
+ auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
+ if (Amt == 32) {
+ // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
+ Lo = Freeze.getReg(0);
+ } else {
+ // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
+ Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
+ }
+
+ auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
+ Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
+ } else {
+ // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
+ Lo = Op1.getReg(0);
+ Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
+ }
+
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+ break;
+ }
case Div_BFE: {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == LLT::scalar(64));
@@ -356,6 +408,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
MI.eraseFromParent();
return;
}
+ case Unpack: {
+ Register Lo, Hi;
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_SHL: {
+ auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+ break;
+ }
+ case AMDGPU::G_LSHR: {
+ auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+ break;
+ }
+ case AMDGPU::G_ASHR: {
+ auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
+ Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
+ Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
+ break;
+ }
+ default:
+ llvm_unreachable("Unpack lowering not implemented");
+ }
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+ return;
+ }
case SplitLoad: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
unsigned Size = DstTy.getSizeInBits();
@@ -445,6 +528,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case SgprP5:
case VgprP5:
return LLT::pointer(5, 32);
+ case SgprV2S16:
+ case VgprV2S16:
+ case UniInVgprV2S16:
+ return LLT::fixed_vector(2, 16);
+ case SgprV2S32:
+ case VgprV2S32:
+ return LLT::fixed_vector(2, 32);
case SgprV4S32:
case VgprV4S32:
case UniInVgprV4S32:
@@ -518,6 +608,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32:
case SgprB32:
case SgprB64:
@@ -527,6 +619,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB512:
case UniInVcc:
case UniInVgprS32:
+ case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
case UniInVgprB64:
@@ -548,6 +641,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32:
case VgprB32:
case VgprB64:
@@ -585,6 +680,8 @@ void RegBankLegalizeHelper::applyMappingDst(
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32:
case Vgpr16:
case Vgpr32:
@@ -594,6 +691,8 @@ void RegBankLegalizeHelper::applyMappingDst(
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
@@ -628,6 +727,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == SgprRB);
@@ -701,6 +801,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
assert(RB == getRegBankFromID(MethodIDs[i]));
@@ -726,6 +828,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
if (RB != VgprRB) {
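The three unpack helpers added above differ only in how the high 16 bits of each extracted lane are filled. A standalone C++ model of the bit math (hypothetical names, not the builder code; assumes arithmetic right shift on signed values):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Hypothetical scalar models of the unpack helpers: a <2 x s16> is viewed as
// one 32-bit value with lane 0 in bits [15:0] and lane 1 in bits [31:16].
using Lanes = std::pair<uint32_t, uint32_t>;

Lanes unpackZExtModel(uint32_t packed) {
  return {packed & 0x0000ffff,  // mask zero-fills lane 0's high bits
          packed >> 16};        // logical shift zero-fills lane 1's high bits
}

Lanes unpackSExtModel(uint32_t packed) {
  // Assumes >> on int32_t is an arithmetic shift (true on mainstream targets).
  return {static_cast<uint32_t>(static_cast<int32_t>(packed << 16) >> 16),
          static_cast<uint32_t>(static_cast<int32_t>(packed) >> 16)};
}

Lanes unpackAExtModel(uint32_t packed) {
  return {packed,         // lane 0 keeps lane 1 as garbage high bits
          packed >> 16};
}

int main() {
  auto [lo, hi] = unpackSExtModel(0x80007fff);
  assert(lo == 0x00007fffu);   // sext_inreg(x, 16)
  assert(hi == 0xffff8000u);   // ashr(x, 16)
}
```

This is also why each shift opcode requests a different variant in the Unpack lowering: G_SHL tolerates garbage high bits in the value (AExt), G_LSHR needs zeros shifted in (ZExt), and G_ASHR needs sign copies (SExt).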
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index ae3ab86449dd5..e9aa97c2979d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -12,6 +12,7 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
namespace llvm {
@@ -108,6 +109,10 @@ class RegBankLegalizeHelper {
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
+
+ std::pair<Register, Register> unpackZExt(Register Reg);
+ std::pair<Register, Register> unpackSExt(Register Reg);
+ std::pair<Register, Register> unpackAExt(Register Reg);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 96b0a7d634f7e..78149472b0b09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -16,7 +16,9 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
@@ -60,6 +62,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case V2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
case V4S32:
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
case B32:
@@ -92,6 +96,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
case UniP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+ case UniV2S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
case UniB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
case UniB64:
@@ -122,6 +128,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
case DivP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
+ case DivV2S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -434,7 +442,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
MachineRegisterInfo &_MRI)
: ST(&_ST), MRI(&_MRI) {
- addRulesForGOpcs({G_ADD}, Standard)
+ addRulesForGOpcs({G_ADD, G_SUB}, Standard)
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
@@ -451,11 +459,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
addRulesForGOpcs({G_SHL}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+ addRulesForGOpcs({G_LSHR}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
- addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}});
+ addRulesForGOpcs({G_ASHR}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+ addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, Uni_BFE})
@@ -514,6 +547,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
+ .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
+ .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
// This is non-trivial. VgprToVccCopy is done using compare instruction.
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
@@ -549,6 +584,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
+ addRulesForGOpcs({G_SEXT_INREG})
+ .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
+ .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
+ .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SExtInRegSplitTo32}});
+
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
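The SExtInRegSplitTo32 lowering selected above for divergent 64-bit G_SEXT_INREG works entirely in 32-bit halves: when the sign position sits in the low half, the whole high half becomes sign copies (ashr by 31); when it sits in the high half, the low half passes through unchanged. A standalone C++ model of the arithmetic (my own helper name; the freeze step has no scalar counterpart and is omitted, and an arithmetic >> on signed values is assumed):

```cpp
#include <cassert>
#include <cstdint>

// sext_inreg from bit position `amt` on a 64-bit value, using only 32-bit
// operations, mirroring the SExtInRegSplitTo32 case structure.
uint64_t sextInReg64Via32(uint64_t v, int amt) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  if (amt <= 32) {
    if (amt < 32) // Amt == 32 leaves the low half unchanged.
      lo = static_cast<uint32_t>(static_cast<int32_t>(lo << (32 - amt)) >>
                                 (32 - amt)); // sext_inreg(lo, amt)
    hi = static_cast<uint32_t>(static_cast<int32_t>(lo) >> 31); // sign copies
  } else {
    // Low half is unchanged; sign-extend within the high half only.
    hi = static_cast<uint32_t>(static_cast<int32_t>(hi << (64 - amt)) >>
                               (64 - amt)); // sext_inreg(hi, amt - 32)
  }
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

int main() {
  assert(sextInReg64Via32(0x0000000000008000ull, 16) == 0xffffffffffff8000ull);
  assert(sextInReg64Via32(0x0000000080000000ull, 32) == 0xffffffff80000000ull);
  assert(sextInReg64Via32(0x0000800000000000ull, 48) == 0xffff800000000000ull);
}
```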
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 058e58c1a94ce..435323cbb9df4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID {
V3S32,
V4S32,
+ UniV2S16,
+
+ DivV2S16,
+
// B types
B32,
B64,
@@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID {
SgprP3,
SgprP4,
SgprP5,
+ SgprV2S16,
SgprV4S32,
+ SgprV2S32,
SgprB32,
SgprB64,
SgprB96,
@@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID {
VgprP3,
VgprP4,
VgprP5,
+ VgprV2S16,
+ VgprV2S32,
VgprB32,
VgprB64,
VgprB96,
@@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID {
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
UniInVgprS32,
+ UniInVgprV2S16,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
@@ -173,6 +182,7 @@ enum LoweringMethodID {
DoNotLower,
VccExtToSel,
UniExtToSel,
+ SExtInRegSplitTo32,
Uni_BFE,
Div_BFE,
VgprToVccCopy,
@@ -180,6 +190,7 @@ enum LoweringMethodID {
SplitTo32Sel,
Ext32To64,
UniCstExt,
+ Unpack,
SplitLoad,
WidenLoad,
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 493e8cef63890..c2c60136e8a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i8 @v_ashr_i8(i8 %value, i8 %amount) {
; GFX6-LABEL: v_ashr_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 784611cf68dd2..ec4e023182808 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i8 @v_lshr_i8(i8 %value, i8 %amount) {
; GFX6-LABEL: v_lshr_i8:
@@ -794,22 +794,22 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX9-LABEL: s_lshr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, s1
-; GFX9-NEXT: s_lshr_b32 s1, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s1, s2, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v2i16:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16
; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1
-; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3
-; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s1
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX10PLUS-NEXT: ; return t...
[truncated]
Force-pushed from 9a0eaa1 to 9f92e94:
Uniform S16 shifts have to be extended to S32 using the appropriate extend
before lowering to the S32 instruction.
Uniform packed V2S16 shifts are lowered to SGPR S32 instructions;
the other option would be to use VALU packed V2S16 and ReadAnyLane.
For uniform S32 and S64, and for divergent S16, S32, S64 and V2S16,
instructions are available directly.
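The s_and/s_lshr/s_pack_ll_b32_b16 sequence in the s_lshr_v2i16 checks above is the Unpack lowering at work on a uniform <2 x i16> lshr: each lane is zero-extended out of the packed SGPR, shifted as a 32-bit scalar, and repacked. A C++ model of the net effect (hypothetical name; the scalar shifts mask their amount to 5 bits as s_lshr_b32 does, which is why the lane-0 amount needs no explicit zero-extend):

```cpp
#include <cassert>
#include <cstdint>

// Net effect of the uniform <2 x i16> lshr lowering: zero-extend each lane
// out of the packed SGPR, shift as 32-bit scalars, then repack the low
// halves (s_pack_ll_b32_b16).
uint32_t lshrV2I16Model(uint32_t val, uint32_t amt) {
  uint32_t lo = (val & 0xffff) >> (amt & 31);        // lane 0, zero-extended
  uint32_t hi = (val >> 16) >> ((amt >> 16) & 31);   // lane 1, zero-extended
  return (lo & 0xffff) | ((hi & 0xffff) << 16);      // s_pack_ll_b32_b16
}

int main() {
  // Lanes: val = {0x0004, 0x8000}, amt = {2, 1} -> {0x0001, 0x4000}.
  assert(lshrV2I16Model(0x80000004u, 0x00010002u) == 0x40000001u);
}
```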