From 2fdf172213d449b78bc6de1ac20d493adda29dbc Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Mon, 14 Apr 2025 16:35:19 +0200
Subject: [PATCH] AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts
 and sext-inreg

Uniform S16 shifts have to be extended to S32 using the appropriate Extend
before lowering to the S32 instruction. Uniform packed V2S16 is lowered to
SGPR S32 instructions; the other option would be to use VALU packed V2S16
and ReadAnyLane. For uniform S32 and S64, and for divergent S16, S32, S64
and V2S16, instructions are available directly.
---
 .../Target/AMDGPU/AMDGPURegBankLegalize.cpp   |   2 +-
 .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp    | 107 ++++++++++
 .../AMDGPU/AMDGPURegBankLegalizeHelper.h      |   5 +
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     |  43 +++-
 .../AMDGPU/AMDGPURegBankLegalizeRules.h       |  11 ++
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |  10 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   | 187 +++++++++---------
 .../AMDGPU/GlobalISel/regbankselect-ashr.mir  |   6 +-
 .../AMDGPU/GlobalISel/regbankselect-lshr.mir  |  17 +-
 .../GlobalISel/regbankselect-sext-inreg.mir   |  24 +--
 .../AMDGPU/GlobalISel/regbankselect-shl.mir   |   6 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |  34 ++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  10 +-
 13 files changed, 311 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 9544c9f43eeaf..15584f16a0638 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -310,7 +310,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
     // Opcodes that support pretty much all combinations of reg banks and LLTs
     // (except S1). There is no point in writing rules for them.
    if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
-        Opc == AMDGPU::G_MERGE_VALUES) {
+        Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
       RBLHelper.applyMappingTrivial(*MI);
       continue;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 59cd23847311c..9f240c8e6a7a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -14,11 +14,13 @@
 #include "AMDGPURegBankLegalizeHelper.h"
 #include "AMDGPUGlobalISelUtils.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPURegBankLegalizeRules.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
@@ -166,6 +168,59 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
+  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
+  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
+  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+  return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
+  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
+  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+  return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
+  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+  auto Lo = PackedS32;
+  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+  return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+void RegBankLegalizeHelper::lowerUnpack(MachineInstr &MI) {
+  Register Lo, Hi;
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_SHL: {
+    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
+    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
+    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+    break;
+  }
+  case AMDGPU::G_LSHR: {
+    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
+    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
+    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+    break;
+  }
+  case AMDGPU::G_ASHR: {
+    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
+    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
+    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
+    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
+    break;
+  }
+  default:
+    llvm_unreachable("Unpack lowering not implemented");
+  }
+  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
+  MI.eraseFromParent();
+}
+
 bool isSignedBFE(MachineInstr &MI) {
   if (isa<GIntrinsic>(MI)) {
     switch (MI.getOperand(1).getIntrinsicID()) {
@@ -311,6 +366,33 @@ void RegBankLegalizeHelper::lowerSplitTo32Sel(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
+  auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg()); + int Amt = MI.getOperand(2).getImm(); + Register Lo, Hi; + // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend + if (Amt <= 32) { + auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0)); + if (Amt == 32) { + // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx + Lo = Freeze.getReg(0); + } else { + // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx + Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0); + } + + auto SignExtCst = B.buildConstant(SgprRB_S32, 31); + Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0); + } else { + // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx + Lo = Op1.getReg(0); + Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0); + } + + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -333,6 +415,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); return; } + case Unpack: + return lowerUnpack(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -399,6 +483,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerSplitTo32(MI); case SplitTo32Sel: return lowerSplitTo32Sel(MI); + case SplitTo32SExtInReg: + return lowerSplitTo32SExtInReg(MI); case SplitLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = DstTy.getSizeInBits(); @@ -488,6 +574,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprV2S16: + case VgprV2S16: + case UniInVgprV2S16: + return LLT::fixed_vector(2, 16); + case SgprV2S32: + case VgprV2S32: + return LLT::fixed_vector(2, 32); case SgprV4S32: case VgprV4S32: case UniInVgprV4S32: @@ -561,6 +654,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case SgprB32: case SgprB64: @@ -570,6 +665,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB512: case UniInVcc: case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: case UniInVgprB64: @@ -591,6 +687,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: case VgprB32: case VgprB64: @@ -628,6 +726,8 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: case Vgpr16: case Vgpr32: @@ -637,6 +737,8 @@ void RegBankLegalizeHelper::applyMappingDst( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); @@ -671,6 +773,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); @@ -744,6 +847,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprP3: case SgprP4: case SgprP5: + case SgprV2S16: + case SgprV2S32: case SgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); assert(RB == getRegBankFromID(MethodIDs[i])); @@ -769,6 +874,8 @@ void RegBankLegalizeHelper::applyMappingSrc( case VgprP3: case VgprP4: case VgprP5: + case VgprV2S16: + case VgprV2S32: case 
VgprV4S32: {
     assert(Ty == getTyFromID(MethodIDs[i]));
     if (RB != VgprRB) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 5c096b1b261be..1369fa2332f04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -111,10 +111,15 @@ class RegBankLegalizeHelper {
                 SmallSet &SgprWaterfallOperandRegs);
 
   void lowerVccExtToSel(MachineInstr &MI);
+  std::pair<Register, Register> unpackZExt(Register Reg);
+  std::pair<Register, Register> unpackSExt(Register Reg);
+  std::pair<Register, Register> unpackAExt(Register Reg);
+  void lowerUnpack(MachineInstr &MI);
   void lowerDiv_BFE(MachineInstr &MI);
   void lowerUni_BFE(MachineInstr &MI);
   void lowerSplitTo32(MachineInstr &MI);
   void lowerSplitTo32Sel(MachineInstr &MI);
+  void lowerSplitTo32SExtInReg(MachineInstr &MI);
 };
 
 } // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 9df416aad78b3..9c33f2f06a3b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -60,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64);
   case P5:
     return MRI.getType(Reg) == LLT::pointer(5, 32);
+  case V2S32:
+    return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
   case V4S32:
     return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
   case B32:
@@ -92,6 +94,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
   case UniP5:
     return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+  case UniV2S16:
+    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
   case UniB32:
     return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
   case UniB64:
@@ -122,6 +126,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
   case DivP5:
     return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
+  case DivV2S16:
+    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
   case DivB32:
     return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
   case DivB64:
@@ -435,7 +441,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
                                            MachineRegisterInfo &_MRI)
     : ST(&_ST), MRI(&_MRI) {
 
-  addRulesForGOpcs({G_ADD}, Standard)
+  addRulesForGOpcs({G_ADD, G_SUB}, Standard)
       .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
       .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
 
@@ -452,11 +458,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
 
   addRulesForGOpcs({G_SHL}, Standard)
+      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
+      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
       .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+  addRulesForGOpcs({G_LSHR}, Standard)
+      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
+      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
       .Uni(S64, {{Sgpr64},
{Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); - addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}); + addRulesForGOpcs({G_ASHR}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, Uni_BFE}) @@ -515,6 +546,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}}) .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}) + .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) + .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) // This is non-trivial. VgprToVccCopy is done using compare instruction. .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}) .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}) @@ -550,6 +583,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}) .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}); + addRulesForGOpcs({G_SEXT_INREG}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) + .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); + bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; bool hasSMRDSmall = ST->hasScalarSubwordLoads(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 058e58c1a94ce..1bea78c00b0cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID { V3S32, V4S32, + UniV2S16, + + DivV2S16, + // B types B32, B64, @@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID { SgprP3, SgprP4, SgprP5, + SgprV2S16, SgprV4S32, + SgprV2S32, SgprB32, SgprB64, SgprB96, @@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID { VgprP3, VgprP4, VgprP5, + VgprV2S16, + VgprV2S32, VgprB32, VgprB64, VgprB96, @@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID { // Dst only modifiers: read-any-lane and truncs UniInVcc, UniInVgprS32, + UniInVgprV2S16, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -173,11 +182,13 @@ enum LoweringMethodID { DoNotLower, VccExtToSel, UniExtToSel, + Unpack, Uni_BFE, Div_BFE, VgprToVccCopy, SplitTo32, SplitTo32Sel, + SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 493e8cef63890..c2c60136e8a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck 
-check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_ashr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_ashr_i8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 784611cf68dd2..ec4e023182808 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_lshr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_lshr_i8: @@ -794,22 +794,22 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s2, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v2i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s2, s0, 0xffff +; 
GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 @@ -989,34 +989,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s4, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX9-NEXT: s_and_b32 s2, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v4i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s4, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s5 +; GFX10PLUS-NEXT: s_and_b32 s4, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s5 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> @@ -1190,58 +1190,58 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 +; GFX9-NEXT: s_lshr_b32 s0, s0, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 
s2, s2, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s1, s1, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: s_lshr_b32 s4, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s2, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s4, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 +; GFX9-NEXT: s_lshr_b32 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_v8i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s8, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s9 -; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s9 +; GFX10PLUS-NEXT: s_and_b32 s8, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s9 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10PLUS-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s9 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s5, s1 +; GFX10PLUS-NEXT: s_and_b32 s4, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s5 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10PLUS-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10PLUS-NEXT: s_and_b32 s5, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s7 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s5, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> @@ -1574,8 +1574,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-LABEL: v_lshr_i65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 1 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 @@ -1596,8 +1597,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-LABEL: v_lshr_i65: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1618,8 +1620,9 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-LABEL: v_lshr_i65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] @@ -1640,6 +1643,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX10-LABEL: v_lshr_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 @@ -1662,21 +1666,22 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX11-LABEL: v_lshr_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 1 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount @@ -1688,8 +1693,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, 1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1700,8 +1706,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1712,8 +1719,9 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1724,6 +1732,7 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 @@ -1735,8 +1744,8 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir index 615cfec2b31cf..a0cb85f710443 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: ashr_s32_ss @@ -206,8 +205,7 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32) ; CHECK-NEXT: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir index c5024924a4d32..60b89bf42031d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: lshr_s32_ss @@ -201,15 +200,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 
x s16>) = COPY $sgpr1 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32) ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir index cf0ca2c9eb634..1a8fa56a7f799 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sext-inreg.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s --- name: sext_inreg_s_s32_1 @@ -137,7 +136,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 1 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -162,7 +161,7 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[FREEZE]], 31 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SEXT_INREG]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -186,7 +185,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; 
CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:vgpr(s32) = G_FREEZE [[UV]] - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[FREEZE]], [[C]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[FREEZE]](s32), [[ASHR]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) @@ -209,9 +208,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 1 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 33 @@ -232,9 +230,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 3 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 35 @@ -255,9 +252,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[COPY1]], 31 - ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY1]](s32), [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:vgpr(s32) = G_SEXT_INREG [[UV1]], 31 + ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UV]](s32), [[SEXT_INREG]](s32) ; CHECK-NEXT: S_ENDPGM 0, implicit [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SEXT_INREG %0, 63 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir index b4290ea0a4203..6bdf8e7e1de6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-shl.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: shl_s32_ss @@ -204,8 +203,7 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 ; 
CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[BITCAST]], [[BITCAST1]](s32) ; CHECK-NEXT: [[SHL1:%[0-9]+]]:sgpr(s32) = G_SHL [[LSHR]], [[LSHR1]](s32) ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SHL]](s32), [[SHL1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index 8300e2542d452..0bd18c2cfb534 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_sext_inreg_i8_4(i8 %value) { ; GCN-LABEL: v_sext_inreg_i8_4: @@ -1200,13 +1200,13 @@ define i64 @v_sext_inreg_i64_23(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_23: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_23: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 9 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 23 %ashr = ashr i64 %shl, 23 @@ -1293,13 +1293,13 @@ define i64 @v_sext_inreg_i64_31(i64 %value) { ; GCN-LABEL: v_sext_inreg_i64_31: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_sext_inreg_i64_31: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %shl = shl i64 %value, 31 %ashr = ashr i64 %shl, 31 @@ -1385,15 +1385,15 @@ define <2 x i64> 
@v_sext_inreg_v2i64_16(<2 x i64> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i64_16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_16:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GFX10PLUS-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX10PLUS-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i64> %value, <i64 16, i64 16>
   %ashr = ashr <2 x i64> %shl, <i64 16, i64 16>
@@ -1404,15 +1404,15 @@ define <2 x i64> @v_sext_inreg_v2i64_31(<2 x i64> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i64_31:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 1
-; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 1
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_31:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 1
-; GFX10PLUS-NEXT:    v_bfe_i32 v3, v2, 0, 1
+; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX10PLUS-NEXT:    v_bfe_i32 v3, v3, 0, 1
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i64> %value, <i64 31, i64 31>
   %ashr = ashr <2 x i64> %shl, <i64 31, i64 31>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index b12e915c7d21b..1692ac4479582 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
 
 define i8 @v_shl_i8(i8 %value, i8 %amount) {
 ; GFX6-LABEL: v_shl_i8: