From b7d635ed30da49cc32b5b46d00e67ecc3ff9522f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 08:38:35 -0800 Subject: [PATCH 001/366] AMDGPU: Copy correct predicates for SDWA reals (#116288) There are a lot of messes in the special case predicate handling. Currently broad let blocks override specific predicates with more general cases. For instructions with SDWA, the HasSDWA predicate was overriding the SubtargetPredicate for the instruction. This fixes enough to properly disallow new instructions that support SDWA on older targets. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 +++-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 26 ++++++++++++---------- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 11 +++++---- 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c8ae010414dc4..d7feaef8c4a97 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2103,8 +2103,10 @@ def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes() def HasFminFmaxLegacy : Predicate<"Subtarget->hasFminFmaxLegacy()">; -def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>; +def HasSDWA : Predicate<"Subtarget->hasSDWA()">; + +def HasSDWA8 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<(all_of (not FeatureGFX9Insts), FeatureSDWA)>; def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index c743eb43e3465..f7a66a0820939 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1268,7 +1268,7 @@ multiclass VOP1_Real_vi op> { if !cast(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : - VOP_SDWA_Real (NAME#"_sdwa")>, + VOP_SDWA8_Real (NAME#"_sdwa")>, VOP1_SDWAe (NAME#"_sdwa").Pfl>; if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then @@ -1474,7 +1474,7 @@ def : GCNPat < // GFX9 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let DecoderNamespace = "GFX9" in { multiclass VOP1_Real_gfx9 op> { defm NAME : VOP1_Real_e32e64_vi ; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 925b60561c9d6..c0d38fa52b344 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -766,16 +766,16 @@ defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, " defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">; -let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { +let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">; } -let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1, isAdd = 1 in { +let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1, isAdd = 1 in { defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32">; } -let isAdd = 1 in { +let isAdd = 1 in { defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", 
VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32">; defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">; } @@ -2290,10 +2290,10 @@ multiclass Base_VOP2_Real_e32e64_vi op> : } // End AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" -multiclass VOP2_SDWA_Real op> { +multiclass VOP2_SDWA8_Real op> { if !cast(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : - VOP_SDWA_Real (NAME#"_sdwa")>, + VOP_SDWA8_Real (NAME#"_sdwa")>, VOP2_SDWAe (NAME#"_sdwa").Pfl>; } @@ -2321,7 +2321,7 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName } if !cast(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : - VOP_SDWA_Real (OpName#"_sdwa")>, + VOP_SDWA8_Real (OpName#"_sdwa")>, VOP2_SDWAe (OpName#"_sdwa").Pfl> { VOP2_SDWA_Pseudo ps = !cast(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; @@ -2337,7 +2337,7 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName } // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" -let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let DecoderNamespace = "GFX9" in { multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { def _e32_gfx9 : @@ -2386,10 +2386,10 @@ multiclass VOP2_Real_e32e64_gfx9 op> { VOP2_DPPe(NAME#"_dpp")>; } -} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" +} // End DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi op> : - Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { + Base_VOP2_Real_e32e64_vi, VOP2_SDWA8_Real, VOP2_SDWA9_Real { if !cast(NAME#"_e32").Pfl.HasExtDPP then def _dpp_vi : @@ -2401,7 +2401,7 @@ defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; -let AssemblerPredicate = isGCN3ExcludingGFX90A in +let OtherPredicates = [isGCN3ExcludingGFX90A] in defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>; defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>; defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>; @@ -2431,6 +2431,7 @@ defm V_ADDC_U32 : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32", " defm V_SUBB_U32 : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32", "v_subb_u32">; defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">; +let AssemblerPredicate = isGFX9Only in { defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_CO_U32", "v_add_co_u32">; defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_CO_U32", "v_sub_co_u32">; defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_CO_U32", "v_subrev_co_u32">; @@ -2441,6 +2442,7 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; +} // End AssemblerPredicate = isGFX9Only defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; @@ -2518,7 +2520,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts -let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in { +let DecoderNamespace = "GFX90A" in { multiclass VOP2_Real_e32_gfx90a op> { def _e32_gfx90a : VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX90A>, @@ -2551,7 +2553,7 @@ let SubtargetPredicate = HasFmacF64Inst in { defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>; } // End SubtargetPredicate = HasFmacF64Inst -let SubtargetPredicate = isGFX90APlus, IsSingle = 
1 in { +let IsSingle = 1 in { defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>; } diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index d6e08dce130ce..f4ccae1decb1d 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -2290,7 +2290,7 @@ multiclass VOPC_Real_vi op> { if !cast(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : - VOP_SDWA_Real (NAME#"_sdwa")>, + VOP_SDWA8_Real (NAME#"_sdwa")>, VOPC_SDWAe (NAME#"_sdwa").Pfl>; if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index aab5dc7465d93..1be434c2c11f7 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -650,7 +650,6 @@ class VOP_SDWA_Pseudo pattern=[]> : let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let SubtargetPredicate = HasSDWA; - let AssemblerPredicate = HasSDWA; let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA, AMDGPUAsmVariants.Disable); let DecoderNamespace = "GFX8"; @@ -658,7 +657,7 @@ class VOP_SDWA_Pseudo pattern=[]> : VOPProfile Pfl = P; } -class VOP_SDWA_Real : +class VOP_SDWA8_Real : InstSI , SIMCInstr { @@ -676,7 +675,7 @@ class VOP_SDWA_Real : // Copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; - let AssemblerPredicate = ps.AssemblerPredicate; + let AssemblerPredicate = HasSDWA8; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; @@ -708,7 +707,7 @@ class Base_VOP_SDWA9_Real : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - let SubtargetPredicate = HasSDWA9; + let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = HasSDWA9; let OtherPredicates = ps.OtherPredicates; let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9, @@ -735,7 +734,7 @@ class VOP_SDWA9_Real : SIMCInstr ; class Base_VOP_SDWA10_Real : Base_VOP_SDWA9_Real { - let SubtargetPredicate = HasSDWA10; + let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = HasSDWA10; let DecoderNamespace = "GFX10"; } @@ -1508,7 +1507,7 @@ class VOP3_DPP16_t16_Helper op, VOP_DPP_Pseudo ps, let SchedRW = ps.SchedRW; let Uses = ps.Uses; let AssemblerPredicate = HasDPP16; - let SubtargetPredicate = HasDPP16; + let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; } From 6bf8f08989420ccd10efed5fac88052ca16e1250 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 18 Nov 2024 08:56:25 -0800 Subject: [PATCH 002/366] [memprof] Add InstrProfWriter::addMemProfData (#116528) This patch adds InstrProfWriter::addMemProfData, which adds the complete MemProf profile (frames, call stacks, and records) to the writer context. Without this function, functions like loadInput in llvm-profdata.cpp and InstrProfWriter::mergeRecordsFromWriter must add one item (frame, call stack, or record) at a time. The new function std::moves the entire MemProf profile to the writer context if the destination is empty, which is the common use case. Otherwise, we fall back to adding one item at a time behind the scene. Here are a couple of reasons why we should add this function: - We've had a bug where we forgot to add one of the three data structures (frames, call stacks, and records) to the writer context, resulting in a nearly empty indexed profile. 
We should always package the three data structures together, especially on API boundaries. - We expose a little too much of the MemProf detail to InstrProfWriter. I'd like to gradually transform InstrProfReader/Writer to entities managing buffers (sequences of bytes), with actual serialization/deserialization left to external classes. We already do some of this in InstrProfReader, where InstrProfReader "contracts out" to IndexedMemProfReader to handle MemProf details. I am not changing loadInput or InstrProfWriter::mergeRecordsFromWriter for now because MemProfReader uses DenseMap for frames and call stacks, whereas MemProfData uses MapVector. I'll resolve these mismatches in subsequent patches. --- .../llvm/ProfileData/InstrProfWriter.h | 4 + llvm/lib/ProfileData/InstrProfWriter.cpp | 29 +++++ llvm/unittests/ProfileData/InstrProfTest.cpp | 112 ++++++++---------- 3 files changed, 84 insertions(+), 61 deletions(-) diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index 199e565bead04..fa30926c66258 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -130,6 +130,10 @@ class InstrProfWriter { const llvm::SmallVector &CallStack, function_ref Warn); + /// Add the entire MemProfData \p Incoming to the writer context. + bool addMemProfData(memprof::IndexedMemProfData Incoming, + function_ref Warn); + // Add a binary id to the binary ids list. void addBinaryIds(ArrayRef BIs); diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 47f463541d8ef..87a538f35c786 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -350,6 +350,35 @@ bool InstrProfWriter::addMemProfCallStack( return true; } +bool InstrProfWriter::addMemProfData(memprof::IndexedMemProfData Incoming, + function_ref Warn) { + // TODO: Once we remove support for MemProf format Version V1, assert that + // the three components (frames, call stacks, and records) are either all + // empty or populated. 
+ + if (MemProfData.Frames.empty()) + MemProfData.Frames = std::move(Incoming.Frames); + else + for (const auto &[Id, F] : Incoming.Frames) + if (addMemProfFrame(Id, F, Warn)) + return false; + + if (MemProfData.CallStacks.empty()) + MemProfData.CallStacks = std::move(Incoming.CallStacks); + else + for (const auto &[CSId, CS] : Incoming.CallStacks) + if (addMemProfCallStack(CSId, CS, Warn)) + return false; + + if (MemProfData.Records.empty()) + MemProfData.Records = std::move(Incoming.Records); + else + for (const auto &[GUID, Record] : Incoming.Records) + addMemProfRecord(GUID, Record); + + return true; +} + void InstrProfWriter::addBinaryIds(ArrayRef BIs) { llvm::append_range(BinaryIds, BIs); } diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 582efad531bf7..b9f244104c65c 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -21,6 +21,7 @@ #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include +#include #include using namespace llvm; @@ -348,10 +349,10 @@ TEST_F(InstrProfTest, test_merge_traces_sampled) { using ::llvm::memprof::IndexedMemProfRecord; using ::llvm::memprof::MemInfoBlock; using FrameIdMapTy = - llvm::DenseMap<::llvm::memprof::FrameId, ::llvm::memprof::Frame>; + llvm::MapVector<::llvm::memprof::FrameId, ::llvm::memprof::Frame>; using CallStackIdMapTy = - llvm::DenseMap<::llvm::memprof::CallStackId, - ::llvm::SmallVector<::llvm::memprof::FrameId>>; + llvm::MapVector<::llvm::memprof::CallStackId, + ::llvm::SmallVector<::llvm::memprof::FrameId>>; static FrameIdMapTy getFrameMapping() { FrameIdMapTy Mapping; @@ -467,11 +468,11 @@ TEST_F(InstrProfTest, test_memprof_v0) { /*CallSiteFrames=*/{ {4, 5}, }); - const FrameIdMapTy IdToFrameMap = getFrameMapping(); - for (const auto &I : IdToFrameMap) { - Writer.addMemProfFrame(I.first, I.getSecond(), Err); - } - Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR); + + memprof::IndexedMemProfData MemProfData; + MemProfData.Frames = getFrameMapping(); + MemProfData.Records.try_emplace(0x9999, IndexedMR); + Writer.addMemProfData(MemProfData, Err); auto Profile = Writer.writeBuffer(); readProfile(std::move(Profile)); @@ -482,8 +483,8 @@ TEST_F(InstrProfTest, test_memprof_v0) { std::optional LastUnmappedFrameId; auto IdToFrameCallback = [&](const memprof::FrameId Id) { - auto Iter = IdToFrameMap.find(Id); - if (Iter == IdToFrameMap.end()) { + auto Iter = MemProfData.Frames.find(Id); + if (Iter == MemProfData.Frames.end()) { LastUnmappedFrameId = Id; return memprof::Frame(0, 0, 0, false); } @@ -508,15 +509,11 @@ TEST_F(InstrProfTest, test_memprof_v2_full_schema) { const IndexedMemProfRecord IndexedMR = makeRecordV2( /*AllocFrames=*/{0x111, 0x222}, /*CallSiteFrames=*/{0x333}, MIB, memprof::getFullSchema()); - const FrameIdMapTy IdToFrameMap = getFrameMapping(); - const auto CSIdToCallStackMap = getCallStackMapping(); - for (const auto &I : IdToFrameMap) { - Writer.addMemProfFrame(I.first, I.getSecond(), Err); - } - for (const auto &I : CSIdToCallStackMap) { - Writer.addMemProfCallStack(I.first, I.getSecond(), Err); - } - Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR); + memprof::IndexedMemProfData MemProfData; + MemProfData.Frames = getFrameMapping(); + MemProfData.CallStacks = getCallStackMapping(); + MemProfData.Records.try_emplace(0x9999, IndexedMR); + Writer.addMemProfData(MemProfData, Err); auto Profile = Writer.writeBuffer(); readProfile(std::move(Profile)); @@ -525,9 +522,10 @@ TEST_F(InstrProfTest, 
test_memprof_v2_full_schema) { ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded()); const memprof::MemProfRecord &Record = RecordOr.get(); - memprof::FrameIdConverter FrameIdConv(IdToFrameMap); - memprof::CallStackIdConverter CSIdConv( - CSIdToCallStackMap, FrameIdConv); + memprof::FrameIdConverter FrameIdConv( + MemProfData.Frames); + memprof::CallStackIdConverter CSIdConv( + MemProfData.CallStacks, FrameIdConv); const ::llvm::memprof::MemProfRecord WantRecord = IndexedMR.toMemProfRecord(CSIdConv); @@ -550,15 +548,11 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) { const IndexedMemProfRecord IndexedMR = makeRecordV2( /*AllocFrames=*/{0x111, 0x222}, /*CallSiteFrames=*/{0x333}, MIB, memprof::getHotColdSchema()); - const FrameIdMapTy IdToFrameMap = getFrameMapping(); - const auto CSIdToCallStackMap = getCallStackMapping(); - for (const auto &I : IdToFrameMap) { - Writer.addMemProfFrame(I.first, I.getSecond(), Err); - } - for (const auto &I : CSIdToCallStackMap) { - Writer.addMemProfCallStack(I.first, I.getSecond(), Err); - } - Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR); + memprof::IndexedMemProfData MemProfData; + MemProfData.Frames = getFrameMapping(); + MemProfData.CallStacks = getCallStackMapping(); + MemProfData.Records.try_emplace(0x9999, IndexedMR); + Writer.addMemProfData(MemProfData, Err); auto Profile = Writer.writeBuffer(); readProfile(std::move(Profile)); @@ -567,9 +561,10 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) { ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded()); const memprof::MemProfRecord &Record = RecordOr.get(); - memprof::FrameIdConverter FrameIdConv(IdToFrameMap); - memprof::CallStackIdConverter CSIdConv( - CSIdToCallStackMap, FrameIdConv); + memprof::FrameIdConverter FrameIdConv( + MemProfData.Frames); + memprof::CallStackIdConverter CSIdConv( + MemProfData.CallStacks, FrameIdConv); const ::llvm::memprof::MemProfRecord WantRecord = IndexedMR.toMemProfRecord(CSIdConv); @@ -601,23 +596,21 @@ TEST_F(InstrProfTest, test_caller_callee_pairs) { // Line: 7, Column: 8 // new(...) 
- const std::pair Frames[] = { - {0, {0x123, 1, 2, false}}, - {1, {0x234, 3, 4, true}}, - {2, {0x123, 5, 6, false}}, - {3, {0x345, 7, 8, true}}}; - for (const auto &[FrameId, Frame] : Frames) - Writer.addMemProfFrame(FrameId, Frame, Err); - - const std::pair> - CallStacks[] = {{0x111, {1, 0}}, {0x222, {3, 2}}}; - for (const auto &[CSId, CallStack] : CallStacks) - Writer.addMemProfCallStack(CSId, CallStack, Err); - const IndexedMemProfRecord IndexedMR = makeRecordV2( /*AllocFrames=*/{0x111, 0x222}, /*CallSiteFrames=*/{}, MIB, memprof::getHotColdSchema()); - Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR); + + memprof::IndexedMemProfData MemProfData; + MemProfData.Frames.try_emplace(0, 0x123, 1, 2, false); + MemProfData.Frames.try_emplace(1, 0x234, 3, 4, true); + MemProfData.Frames.try_emplace(2, 0x123, 5, 6, false); + MemProfData.Frames.try_emplace(3, 0x345, 7, 8, true); + MemProfData.CallStacks.try_emplace( + 0x111, std::initializer_list{1, 0}); + MemProfData.CallStacks.try_emplace( + 0x222, std::initializer_list{3, 2}); + MemProfData.Records.try_emplace(0x9999, IndexedMR); + Writer.addMemProfData(MemProfData, Err); auto Profile = Writer.writeBuffer(); readProfile(std::move(Profile)); @@ -681,19 +674,15 @@ TEST_F(InstrProfTest, test_memprof_merge) { ASSERT_THAT_ERROR(Writer2.mergeProfileKind(InstrProfKind::MemProf), Succeeded()); - const FrameIdMapTy IdToFrameMap = getFrameMapping(); - for (const auto &I : IdToFrameMap) { - Writer2.addMemProfFrame(I.first, I.getSecond(), Err); - } - - const auto CSIdToCallStackMap = getCallStackMapping(); - for (const auto &[CSId, CallStack] : CSIdToCallStackMap) - Writer2.addMemProfCallStack(CSId, CallStack, Err); - const IndexedMemProfRecord IndexedMR = makeRecordV2( /*AllocFrames=*/{0x111, 0x222}, /*CallSiteFrames=*/{}, makePartialMIB(), memprof::getHotColdSchema()); - Writer2.addMemProfRecord(/*Id=*/0x9999, IndexedMR); + + memprof::IndexedMemProfData MemProfData; + MemProfData.Frames = getFrameMapping(); + MemProfData.CallStacks = getCallStackMapping(); + MemProfData.Records.try_emplace(0x9999, IndexedMR); + Writer2.addMemProfData(MemProfData, Err); ASSERT_THAT_ERROR(Writer.mergeProfileKind(Writer2.getProfileKind()), Succeeded()); @@ -714,9 +703,10 @@ TEST_F(InstrProfTest, test_memprof_merge) { std::optional LastUnmappedFrameId; - memprof::FrameIdConverter FrameIdConv(IdToFrameMap); - memprof::CallStackIdConverter CSIdConv( - CSIdToCallStackMap, FrameIdConv); + memprof::FrameIdConverter FrameIdConv( + MemProfData.Frames); + memprof::CallStackIdConverter CSIdConv( + MemProfData.CallStacks, FrameIdConv); const ::llvm::memprof::MemProfRecord WantRecord = IndexedMR.toMemProfRecord(CSIdConv); From 4092c0deef466e5b96a221e4066a78ae72efa7af Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 18 Nov 2024 09:08:29 -0800 Subject: [PATCH 003/366] [ELF,ARM] Move global sectionMap into the ARM class Otherwise, LLD_IN_TEST=2 testing arm-plt-reloc.s crashes. 
Follow-up to https://reviews.llvm.org/D150870 --- lld/ELF/Arch/ARM.cpp | 19 +++++++++++-------- lld/ELF/OutputSections.cpp | 2 +- lld/ELF/Target.h | 4 ++-- lld/ELF/Writer.cpp | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 100db44544681..c23a2f872d918 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -49,15 +49,15 @@ class ARM final : public TargetInfo { void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + DenseMap> sectionMap; + private: -void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, - int group, bool check) const; + void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, + int group, bool check) const; }; enum class CodeState { Data = 0, Thumb = 2, Arm = 4 }; } // namespace -static DenseMap> sectionMap{}; - ARM::ARM(Ctx &ctx) : TargetInfo(ctx) { copyRel = R_ARM_COPY; relativeRel = R_ARM_RELATIVE; @@ -1047,10 +1047,10 @@ static bool isDataMapSymbol(const Symbol *b) { return b->getName() == "$d" || b->getName().starts_with("$d."); } -void elf::sortArmMappingSymbols() { +void elf::sortArmMappingSymbols(Ctx &ctx) { // For each input section make sure the mapping symbols are sorted in // ascending order. - for (auto &kv : sectionMap) { + for (auto &kv : static_cast(*ctx.target).sectionMap) { SmallVector &mapSyms = kv.second; llvm::stable_sort(mapSyms, [](const Defined *a, const Defined *b) { return a->value < b->value; @@ -1063,6 +1063,7 @@ void elf::addArmInputSectionMappingSymbols(Ctx &ctx) { // The linker generated mapping symbols for all the synthetic // sections are adding into the sectionmap through the function // addArmSyntheitcSectionMappingSymbol. + auto §ionMap = static_cast(*ctx.target).sectionMap; for (ELFFileBase *file : ctx.objectFiles) { for (Symbol *sym : file->getLocalSymbols()) { auto *def = dyn_cast(sym); @@ -1088,7 +1089,7 @@ void elf::addArmSyntheticSectionMappingSymbol(Defined *sym) { return; if (auto *sec = cast_if_present(sym->section)) if (sec->flags & SHF_EXECINSTR) - sectionMap[sec].push_back(sym); + static_cast(*sec->file->ctx.target).sectionMap[sec].push_back(sym); } static void toLittleEndianInstructions(uint8_t *buf, uint64_t start, @@ -1109,7 +1110,9 @@ static void toLittleEndianInstructions(uint8_t *buf, uint64_t start, // identify half open intervals of Arm code [$a, non $a) and Thumb code // [$t, non $t) and convert these to little endian a word or half word at a // time respectively. -void elf::convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf) { +void elf::convertArmInstructionstoBE8(Ctx &ctx, InputSection *sec, + uint8_t *buf) { + auto §ionMap = static_cast(*ctx.target).sectionMap; auto it = sectionMap.find(sec); if (it == sectionMap.end()) return; diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 094524f9b5379..94cf62d79abb2 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -536,7 +536,7 @@ void OutputSection::writeTo(Ctx &ctx, uint8_t *buf, parallel::TaskGroup &tg) { // instructions to little-endian, leaving the data big-endian. if (ctx.arg.emachine == EM_ARM && !ctx.arg.isLE && ctx.arg.armBe8 && (flags & SHF_EXECINSTR)) - convertArmInstructionstoBE8(isec, buf + isec->outSecOff); + convertArmInstructionstoBE8(ctx, isec, buf + isec->outSecOff); // Fill gaps between sections. 
if (nonZeroFiller) { diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 2277537a4e357..ce42d3624a8f5 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -246,8 +246,8 @@ void riscvFinalizeRelax(int passes); void mergeRISCVAttributesSections(Ctx &); void addArmInputSectionMappingSymbols(Ctx &); void addArmSyntheticSectionMappingSymbol(Defined *); -void sortArmMappingSymbols(); -void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf); +void sortArmMappingSymbols(Ctx &); +void convertArmInstructionstoBE8(Ctx &, InputSection *sec, uint8_t *buf); void createTaggedSymbols(Ctx &); void initSymbolAnchors(Ctx &); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 5865ead0ff88b..d698479c9707f 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2095,7 +2095,7 @@ template void Writer::finalizeSections() { if (ctx.arg.emachine == EM_ARM && !ctx.arg.isLE && ctx.arg.armBe8) { addArmInputSectionMappingSymbols(ctx); - sortArmMappingSymbols(); + sortArmMappingSymbols(ctx); } } From 2444b6f0df56d2aeb0ae6dce946443b23a3a9d3b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 18 Nov 2024 09:09:06 -0800 Subject: [PATCH 004/366] [llvm-objcopy] Replace custom -- parsing with DashDashParsing The custom -- parsing from https://reviews.llvm.org/D102665 can be replaced with the generic feature from https://reviews.llvm.org/D152286 Pull Request: https://github.com/llvm/llvm-project/pull/116565 --- llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 26a888c628d9d..104d802b1e1ee 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -58,6 +58,7 @@ class ObjcopyOptTable : public opt::GenericOptTable { public: ObjcopyOptTable() : opt::GenericOptTable(objcopy_opt::ObjcopyInfoTable) { setGroupedShortOptions(true); + setDashDashParsing(true); } }; @@ -650,17 +651,11 @@ parseChangeSectionAddr(StringRef ArgValue, StringRef OptionName, // help flag is set then parseObjcopyOptions will print the help messege and // exit. 
Expected -objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, +objcopy::parseObjcopyOptions(ArrayRef ArgsArr, function_ref ErrorCallback) { DriverConfig DC; ObjcopyOptTable T; - const char *const *DashDash = - llvm::find_if(RawArgsArr, [](StringRef Str) { return Str == "--"; }); - ArrayRef ArgsArr = ArrayRef(RawArgsArr.begin(), DashDash); - if (DashDash != RawArgsArr.end()) - DashDash = std::next(DashDash); - unsigned MissingArgumentIndex, MissingArgumentCount; llvm::opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); @@ -671,7 +666,7 @@ objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, "argument to '%s' is missing (expected %d value(s))", InputArgs.getArgString(MissingArgumentIndex), MissingArgumentCount); - if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) { + if (InputArgs.size() == 0) { printHelp(T, errs(), ToolType::Objcopy); exit(1); } @@ -695,7 +690,6 @@ objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, for (auto *Arg : InputArgs.filtered(OBJCOPY_INPUT)) Positional.push_back(Arg->getValue()); - std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional)); if (Positional.empty()) return createStringError(errc::invalid_argument, "no input file specified"); From c9260e21d092c3acbb77bb9f6fcd0820f6a138c1 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 18 Nov 2024 09:16:09 -0800 Subject: [PATCH 005/366] [CodeLayout] Do not rebuild chains with -apply-ext-tsp-for-size (#115934) https://github.com/llvm/llvm-project/pull/109711 disables `buildCFGChains()` when `-apply-ext-tsp-for-size` is used to improve codesize. Tail merging can change the layout and normally requires `buildCFGChains()` to be called again, but we want to prevent this when optimizing for codesize. We saw slight size improvement on large binaries with this change. If `-apply-ext-tsp-for-size` is not used, this should be a NFC. --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index bdad63f368dfe..0f68313e64f54 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3558,14 +3558,16 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI, /*AfterPlacement=*/true)) { - // Redo the layout if tail merging creates/removes/moves blocks. - BlockToChain.clear(); - ComputedEdges.clear(); // Must redo the post-dominator tree if blocks were changed. if (MPDT) MPDT->recalculate(MF); - ChainAllocator.DestroyAll(); - buildCFGChains(); + if (!UseExtTspForSize) { + // Redo the layout if tail merging creates/removes/moves blocks. + BlockToChain.clear(); + ComputedEdges.clear(); + ChainAllocator.DestroyAll(); + buildCFGChains(); + } } } From 1c4caece05f1885ba6ed80755d6b5de1b9f99579 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 18 Nov 2024 09:16:50 -0800 Subject: [PATCH 006/366] [Mips] Use APInt::isMask/isShiftedMask to simplify code. 
(#116582) --- llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index afb027a533d5a..c3e21e0ff7a0f 100644 --- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -614,11 +614,9 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const { if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { - // Extract the run of set bits starting with bit zero from the bitwise - // inverse of ImmValue, and test that the inverse of this is the same - // as the original value. - if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) { - + // Check if we have a leading one, then check if the whole value is a + // shifted mask. + if (ImmValue.isNegative() && ImmValue.isShiftedMask()) { Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy); return true; } @@ -647,9 +645,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const { if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { - // Extract the run of set bits starting with bit zero, and test that the - // result is the same as the original value - if (ImmValue == (ImmValue & ~(ImmValue + 1))) { + if (ImmValue.isMask()) { Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy); return true; } From de2e270ee6fb29cfb7730dcf6aaa2552cd4a5efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 18 Nov 2024 09:22:12 -0800 Subject: [PATCH 007/366] [flang][cuda] Materialize box when src or dst are rebox (#116494) --- .../Optimizer/Transforms/CUFOpConversion.cpp | 2 +- flang/test/Fir/CUDA/cuda-data-transfer.fir | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 9de20f0f0d45e..17699dadc7511 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -654,7 +654,7 @@ struct CUFDataTransferOpConversion loc, builder); } auto materializeBoxIfNeeded = [&](mlir::Value val) -> mlir::Value { - if (mlir::isa(val.getDefiningOp())) { + if (mlir::isa(val.getDefiningOp())) { // Materialize the box to memory to be able to call the runtime. 
mlir::Value box = builder.createTemporary(loc, val.getType()); builder.create(loc, val, box); diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 1ee44f3c6d97c..5f10dc0562d17 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -466,4 +466,51 @@ func.func @_QPlogical_cst() { // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref>>) -> !fir.ref> // CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none +func.func @_QPcallkernel(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "b"}, %arg2: !fir.ref {fir.bindc_name = "c"}) { + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFcallkernelEa"} : (!fir.box>>, !fir.dscope) -> !fir.box>> + %2 = fir.rebox %1 : (!fir.box>>) -> !fir.box>> + %3 = cuf.alloc !fir.box>>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFcallkernelEadev"} -> !fir.ref>>>> + %7 = fir.declare %3 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFcallkernelEadev"} : (!fir.ref>>>>) -> !fir.ref>>>> + %8 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFcallkernelEb"} : (!fir.ref, !fir.dscope) -> !fir.ref + %9 = fir.declare %arg2 dummy_scope %0 {uniq_name = "_QFcallkernelEc"} : (!fir.ref, !fir.dscope) -> !fir.ref + %10 = fir.alloca i32 {bindc_name = "m", uniq_name = "_QFcallkernelEm"} + %11 = fir.declare %10 {uniq_name = "_QFcallkernelEm"} : (!fir.ref) -> !fir.ref + %12 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFcallkernelEn"} + %13 = fir.declare %12 {uniq_name = "_QFcallkernelEn"} : (!fir.ref) -> !fir.ref + %14:3 = fir.box_dims %2, %c0 : (!fir.box>>, index) -> (index, index, index) + %15 = fir.convert %14#1 : (index) -> i32 + fir.store %15 to %13 : !fir.ref + %16:3 = fir.box_dims %2, %c1 : (!fir.box>>, index) -> (index, index, index) + %27 = fir.load %13 : !fir.ref + %28 = fir.convert %27 : (i32) -> index + %29 = arith.cmpi sgt, %28, %c0 : index + %30 = arith.select %29, %28, %c0 : index + %31 = fir.load %11 : !fir.ref + %32 = fir.convert %31 : (i32) -> index + %33 = arith.cmpi sgt, %32, %c0 : index + %34 = arith.select %33, %32, %c0 : index + %35 = fir.shape %30, %34 : (index, index) -> !fir.shape<2> + %36 = fir.undefined index + %37 = fir.slice %c1, %28, %c1, %c1, %32, %c1 : (index, index, index, index, index, index) -> !fir.slice<2> + %38 = fir.rebox %2 [%37] : (!fir.box>>, !fir.slice<2>) -> !fir.box>> + cuf.data_transfer %38 to %7 {transfer_kind = #cuf.cuda_transfer} : !fir.box>>, !fir.ref>>>> + return +} + +// CHECK-LABEL: func.func @_QPcallkernel( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box>> {fir.bindc_name = "a"} +// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.box>> +// CHECK: %[[DECL_ARG0:.*]] = fir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFcallkernelEa"} : (!fir.box>>, !fir.dscope) -> !fir.box>> +// CHECK: %[[REBOX0:.*]] = fir.rebox %[[DECL_ARG0]] : (!fir.box>>) -> !fir.box>> +// CHECK: %[[REBOX1:.*]] = fir.rebox %[[REBOX0]] [%{{.*}}] : (!fir.box>>, !fir.slice<2>) -> !fir.box>> +// CHECK: fir.store %[[REBOX1]] to %[[ALLOCA]] : !fir.ref>>> +// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call 
@_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none + } // end of module From 9161e6ab745adeef67a129b4e1b6724f026125f0 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Mon, 18 Nov 2024 09:39:13 -0800 Subject: [PATCH 008/366] [SandboxIR] Add debug checker to compare IR before/after a revert (#115968) This will help us catch mistakes in change tracking. It's only enabled when EXPENSIVE_CHECKS are enabled. --- llvm/include/llvm/SandboxIR/Context.h | 11 ++-- llvm/include/llvm/SandboxIR/Instruction.h | 1 + llvm/include/llvm/SandboxIR/Tracker.h | 66 ++++++++++++++++++-- llvm/lib/SandboxIR/Tracker.cpp | 73 ++++++++++++++++++++++- llvm/unittests/SandboxIR/TrackerTest.cpp | 63 +++++++++++++++++++ 5 files changed, 204 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index f2056de87cb94..b0d6f8335d9e0 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -44,11 +44,12 @@ class Context { protected: LLVMContext &LLVMCtx; - friend class Type; // For LLVMCtx. - friend class PointerType; // For LLVMCtx. - friend class IntegerType; // For LLVMCtx. - friend class StructType; // For LLVMCtx. - friend class Region; // For LLVMCtx. + friend class Type; // For LLVMCtx. + friend class PointerType; // For LLVMCtx. + friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. + friend class Region; // For LLVMCtx. + friend class IRSnapshotChecker; // To snapshot LLVMModuleToModuleMap. Tracker IRTracker; diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index d9642365908d2..2a59d72e28552 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -11,6 +11,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Constant.h" diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index dab20eb809ba0..9a031f3270837 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -42,13 +42,12 @@ #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Module.h" #include "llvm/SandboxIR/Use.h" #include "llvm/Support/Debug.h" #include -#include namespace llvm::sandboxir { @@ -64,9 +63,56 @@ class SwitchInst; class ConstantInt; class ShuffleVectorInst; class CmpInst; -class Module; class GlobalVariable; +#ifndef NDEBUG + +/// A class that saves hashes and textual IR snapshots of functions in a +/// SandboxIR Context, and does hash comparison when `expectNoDiff` is called. +/// If hashes differ, it prints textual IR for both old and new versions to +/// aid debugging. +/// +/// This is used as an additional debug check when reverting changes to +/// SandboxIR, to verify the reverted state matches the initial state. +class IRSnapshotChecker { + Context &Ctx; + + // A snapshot of textual IR for a function, with a hash for quick comparison. + struct FunctionSnapshot { + llvm::stable_hash Hash; + std::string TextualIR; + }; + + // A snapshot for each llvm::Function found in every module in the SandboxIR + // Context. 
In practice there will always be one module, but sandbox IR + // save/restore ops work at the Context level, so we must take the full state + // into account. + using ContextSnapshot = DenseMap; + + ContextSnapshot OrigContextSnapshot; + + // Dumps to a string the textual IR for a single Function. + std::string dumpIR(const llvm::Function &F) const; + + // Returns a snapshot of all the modules in the sandbox IR context. + ContextSnapshot takeSnapshot() const; + + // Compares two snapshots and returns true if they differ. + bool diff(const ContextSnapshot &Orig, const ContextSnapshot &Curr) const; + +public: + IRSnapshotChecker(Context &Ctx) : Ctx(Ctx) {} + + /// Saves a snapshot of the current state. If there was any previous snapshot, + /// it will be replaced with the new one. + void save(); + + /// Checks current state against saved state, crashes if different. + void expectNoDiff(); +}; + +#endif // NDEBUG + /// The base class for IR Change classes. class IRChangeBase { protected: @@ -405,6 +451,10 @@ class Tracker { TrackerState State = TrackerState::Disabled; Context &Ctx; +#ifndef NDEBUG + IRSnapshotChecker SnapshotChecker; +#endif + public: #ifndef NDEBUG /// Helps catch bugs where we are creating new change objects while in the @@ -412,7 +462,15 @@ class Tracker { bool InMiddleOfCreatingChange = false; #endif // NDEBUG - explicit Tracker(Context &Ctx) : Ctx(Ctx) {} + explicit Tracker(Context &Ctx) + : Ctx(Ctx) +#ifndef NDEBUG + , + SnapshotChecker(Ctx) +#endif + { + } + ~Tracker(); Context &getContext() const { return Ctx; } /// Record \p Change and take ownership. This is the main function used to diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index d35e3ba84990f..27ed37aa9bdd3 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -10,12 +10,75 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/SandboxIR/Instruction.h" #include using namespace llvm::sandboxir; #ifndef NDEBUG + +std::string IRSnapshotChecker::dumpIR(const llvm::Function &F) const { + std::string Result; + raw_string_ostream SS(Result); + F.print(SS, /*AssemblyAnnotationWriter=*/nullptr); + return Result; +} + +IRSnapshotChecker::ContextSnapshot IRSnapshotChecker::takeSnapshot() const { + ContextSnapshot Result; + for (const auto &Entry : Ctx.LLVMModuleToModuleMap) + for (const auto &F : *Entry.first) { + FunctionSnapshot Snapshot; + Snapshot.Hash = StructuralHash(F, /*DetailedHash=*/true); + Snapshot.TextualIR = dumpIR(F); + Result[&F] = Snapshot; + } + return Result; +} + +bool IRSnapshotChecker::diff(const ContextSnapshot &Orig, + const ContextSnapshot &Curr) const { + bool DifferenceFound = false; + for (const auto &[F, OrigFS] : Orig) { + auto CurrFSIt = Curr.find(F); + if (CurrFSIt == Curr.end()) { + DifferenceFound = true; + dbgs() << "Function " << F->getName() << " not found in current IR.\n"; + dbgs() << OrigFS.TextualIR << "\n"; + continue; + } + const FunctionSnapshot &CurrFS = CurrFSIt->second; + if (OrigFS.Hash != CurrFS.Hash) { + DifferenceFound = true; + dbgs() << "Found IR difference in Function " << F->getName() << "\n"; + dbgs() << "Original:\n" << OrigFS.TextualIR << "\n"; + dbgs() << "Current:\n" << CurrFS.TextualIR << "\n"; + } + } + // Check that Curr doesn't contain any new functions. 
+ for (const auto &[F, CurrFS] : Curr) { + if (!Orig.contains(F)) { + DifferenceFound = true; + dbgs() << "Function " << F->getName() + << " found in current IR but not in original snapshot.\n"; + dbgs() << CurrFS.TextualIR << "\n"; + } + } + return DifferenceFound; +} + +void IRSnapshotChecker::save() { OrigContextSnapshot = takeSnapshot(); } + +void IRSnapshotChecker::expectNoDiff() { + ContextSnapshot CurrContextSnapshot = takeSnapshot(); + if (diff(OrigContextSnapshot, CurrContextSnapshot)) { + llvm_unreachable( + "Original and current IR differ! Probably a checkpointing bug."); + } +} + void UseSet::dump() const { dump(dbgs()); dbgs() << "\n"; @@ -275,7 +338,12 @@ void CmpSwapOperands::dump() const { } #endif -void Tracker::save() { State = TrackerState::Record; } +void Tracker::save() { + State = TrackerState::Record; +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) + SnapshotChecker.save(); +#endif +} void Tracker::revert() { assert(State == TrackerState::Record && "Forgot to save()!"); @@ -283,6 +351,9 @@ void Tracker::revert() { for (auto &Change : reverse(Changes)) Change->revert(*this); Changes.clear(); +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) + SnapshotChecker.expectNoDiff(); +#endif } void Tracker::accept() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index 4f2cfa6b06ecd..cee13222179dc 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1844,3 +1844,66 @@ define void @foo(i32 %arg, float %farg) { Ctx.revert(); EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); } + +TEST_F(TrackerTest, IRSnapshotCheckerNoChanges) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + ret i32 %add0 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + [[maybe_unused]] auto *F = Ctx.createFunction(&LLVMF); + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Checker.expectNoDiff(); +} + +TEST_F(TrackerTest, IRSnapshotCheckerDiesWithUnexpectedChanges) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + %add1 = add i32 %add0, %arg + ret i32 %add1 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::Instruction *Add0 = &*It++; + sandboxir::Instruction *Add1 = &*It++; + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Add1->setOperand(1, Add0); + EXPECT_DEATH(Checker.expectNoDiff(), "Found IR difference"); +} + +TEST_F(TrackerTest, IRSnapshotCheckerSaveMultipleTimes) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + %add1 = add i32 %add0, %arg + ret i32 %add1 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::Instruction *Add0 = &*It++; + sandboxir::Instruction *Add1 = &*It++; + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Add1->setOperand(1, Add0); + // Now IR differs from the last snapshot. Let's take a new snapshot. + Checker.save(); + // The new snapshot should have replaced the old one, so this should succeed. 
+ Checker.expectNoDiff(); +} From 4615cc38f35d111f09073f51cc734e29c9211067 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Mon, 18 Nov 2024 17:45:58 +0000 Subject: [PATCH 009/366] [RISCV] Inline Assembly Support for GPR Pairs ('R') (#112983) This patch adds support for getting even-odd general purpose register pairs into and out of inline assembly using the `R` constraint as proposed in riscv-non-isa/riscv-c-api-doc#92 There are a few different pieces to this patch, each of which need their own explanation. - Renames the Register Class used for f64 values on rv32i_zdinx from `GPRPair*` to `GPRF64Pair*`. These register classes are kept broadly unmodified, as their primary value type is used for type inference over selection patterns. This rename affects quite a lot of files. - Adds new `GPRPair*` register classes which will be used for `R` constraints and for instructions that need an even-odd GPR pair. This new type is used for `amocas.d.*`(rv32) and `amocas.q.*`(rv64) in Zacas, instead of the `GPRF64Pair` class being used before. - Marks the new `GPRPair` class legal as for holding a `MVT::Untyped`. Two new RISCVISD node types are added for creating and destructing a pair - `BuildGPRPair` and `SplitGPRPair`, and are introduced when bitcasting to/from the pair type and `untyped`. - Adds functionality to `splitValueIntoRegisterParts` and `joinRegisterPartsIntoValue` to handle changing `i<2*xlen>` MVTs into `untyped` pairs. - Adds an override for `getNumRegisters` to ensure that `i<2*xlen>` values, when going to/from inline assembly, only allocate one (pair) register (they would otherwise allocate two). This is due to a bug in SelectionDAGBuilder.cpp which other backends also work around. - Ensures that Clang understands that `R` is a valid inline assembly constraint. - This also allows `R` to be used for `f64` types on `rv32_zdinx` architectures, where doubles are stored in a GPR pair. --- clang/lib/Basic/Targets/RISCV.cpp | 4 + clang/test/CodeGen/RISCV/riscv-inline-asm.c | 13 ++++ .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 22 ++++-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 24 ++++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 51 +++++++++++-- llvm/lib/Target/RISCV/RISCVISelLowering.h | 17 +++++ llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 12 +-- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 23 +++++- .../CodeGen/RISCV/rv32-inline-asm-pairs.ll | 73 +++++++++++++++++++ .../CodeGen/RISCV/rv64-inline-asm-pairs.ll | 73 +++++++++++++++++++ .../CodeGen/RISCV/zdinx-asm-constraint.ll | 28 ++++++- 11 files changed, 310 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index eaaba7642bd7b..c61ee7ee20392 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -108,6 +108,10 @@ bool RISCVTargetInfo::validateAsmConstraint( return true; } return false; + case 'R': + // An even-odd GPR pair + Info.setAllowsRegister(); + return true; case 'v': // A vector register. 
if (Name[1] == 'r' || Name[1] == 'd' || Name[1] == 'm') { diff --git a/clang/test/CodeGen/RISCV/riscv-inline-asm.c b/clang/test/CodeGen/RISCV/riscv-inline-asm.c index 75b91d3c497c5..de90e513ea1ff 100644 --- a/clang/test/CodeGen/RISCV/riscv-inline-asm.c +++ b/clang/test/CodeGen/RISCV/riscv-inline-asm.c @@ -33,6 +33,19 @@ void test_cf(float f, double d) { asm volatile("" : "=cf"(cd) : "cf"(d)); } +#if __riscv_xlen == 32 +typedef long long double_xlen_t; +#elif __riscv_xlen == 64 +typedef __int128_t double_xlen_t; +#endif +double_xlen_t test_R_wide_scalar(double_xlen_t p) { +// CHECK-LABEL: define{{.*}} {{i128|i64}} @test_R_wide_scalar( +// CHECK: call {{i128|i64}} asm sideeffect "", "=R,R"({{i128|i64}} %{{.*}}) + double_xlen_t ret; + asm volatile("" : "=R"(ret) : "R"(p)); + return ret; +} + void test_I(void) { // CHECK-LABEL: define{{.*}} void @test_I() // CHECK: call void asm sideeffect "", "I"(i32 2047) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4d46afb8c4ef9..1b23b36a59e0e 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -481,6 +481,12 @@ struct RISCVOperand final : public MCParsedAsmOperand { RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); } + bool isGPRPair() const { + return Kind == KindTy::Register && + RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains( + Reg.RegNum); + } + bool isGPRF16() const { return Kind == KindTy::Register && RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum); @@ -491,17 +497,17 @@ struct RISCVOperand final : public MCParsedAsmOperand { RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.RegNum); } - bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - bool isGPRAsFPR16() const { return isGPRF16() && Reg.IsGPRAsFPR; } - bool isGPRAsFPR32() const { return isGPRF32() && Reg.IsGPRAsFPR; } - bool isGPRPairAsFPR() const { return isGPRPair() && Reg.IsGPRAsFPR; } - - bool isGPRPair() const { + bool isGPRF64Pair() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains( + RISCVMCRegisterClasses[RISCV::GPRF64PairRegClassID].contains( Reg.RegNum); } + bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } + bool isGPRAsFPR16() const { return isGPRF16() && Reg.IsGPRAsFPR; } + bool isGPRAsFPR32() const { return isGPRF32() && Reg.IsGPRAsFPR; } + bool isGPRPairAsFPR64() const { return isGPRF64Pair() && Reg.IsGPRAsFPR; } + static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { if (auto *RE = dyn_cast(Expr)) { @@ -2399,7 +2405,7 @@ ParseStatus RISCVAsmParser::parseGPRPairAsFPR64(OperandVector &Operands) { const MCRegisterInfo *RI = getContext().getRegisterInfo(); MCRegister Pair = RI->getMatchingSuperReg( Reg, RISCV::sub_gpr_even, - &RISCVMCRegisterClasses[RISCV::GPRPairRegClassID]); + &RISCVMCRegisterClasses[RISCV::GPRF64PairRegClassID]); Operands.push_back(RISCVOperand::createReg(Pair, S, E, /*isGPRAsFPR=*/true)); return ParseStatus::Success; } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index a1b74faf17fab..034314c88f79f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -952,27 +952,36 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Res); return; } + case RISCVISD::BuildGPRPair: case RISCVISD::BuildPairF64: { - if 
(!Subtarget->hasStdExtZdinx()) + if (Opcode == RISCVISD::BuildPairF64 && !Subtarget->hasStdExtZdinx()) break; - assert(!Subtarget->is64Bit() && "Unexpected subtarget"); + assert((!Subtarget->is64Bit() || Opcode == RISCVISD::BuildGPRPair) && + "BuildPairF64 only handled here on rv32i_zdinx"); + + int RegClassID = (Opcode == RISCVISD::BuildGPRPair) + ? RISCV::GPRPairRegClassID + : RISCV::GPRF64PairRegClassID; + MVT OutType = (Opcode == RISCVISD::BuildGPRPair) ? MVT::Untyped : MVT::f64; SDValue Ops[] = { - CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), + CurDAG->getTargetConstant(RegClassID, DL, MVT::i32), Node->getOperand(0), CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Node->getOperand(1), CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)}; SDNode *N = - CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::f64, Ops); + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, OutType, Ops); ReplaceNode(Node, N); return; } + case RISCVISD::SplitGPRPair: case RISCVISD::SplitF64: { - if (Subtarget->hasStdExtZdinx()) { - assert(!Subtarget->is64Bit() && "Unexpected subtarget"); + if (Subtarget->hasStdExtZdinx() || Opcode != RISCVISD::SplitF64) { + assert((!Subtarget->is64Bit() || Opcode == RISCVISD::SplitGPRPair) && + "SplitF64 only handled here on rv32i_zdinx"); if (!SDValue(Node, 0).use_empty()) { SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL, VT, @@ -990,6 +999,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } + assert(Opcode != RISCVISD::SplitGPRPair && + "SplitGPRPair should already be handled"); + if (!Subtarget->hasStdExtZfa()) break; assert(Subtarget->hasStdExtD() && !Subtarget->is64Bit() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5f970ffe671c6..60fc024f0d274 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -133,7 +133,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.is64Bit()) addRegisterClass(MVT::f64, &RISCV::GPRRegClass); else - addRegisterClass(MVT::f64, &RISCV::GPRPairRegClass); + addRegisterClass(MVT::f64, &RISCV::GPRF64PairRegClass); } static const MVT::SimpleValueType BoolVecVTs[] = { @@ -2233,6 +2233,17 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, return PartVT; } +unsigned +RISCVTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT, + std::optional RegisterVT) const { + // Pair inline assembly operand + if (VT == (Subtarget.is64Bit() ? 
MVT::i128 : MVT::i64) && RegisterVT && + *RegisterVT == MVT::Untyped) + return 1; + + return TargetLowering::getNumRegisters(Context, VT, RegisterVT); +} + unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { @@ -20196,6 +20207,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TAIL) NODE_NAME_CASE(SELECT_CC) NODE_NAME_CASE(BR_CC) + NODE_NAME_CASE(BuildGPRPair) + NODE_NAME_CASE(SplitGPRPair) NODE_NAME_CASE(BuildPairF64) NODE_NAME_CASE(SplitF64) NODE_NAME_CASE(ADD_LO) @@ -20456,6 +20469,7 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 'f': + case 'R': return C_RegisterClass; case 'I': case 'J': @@ -20493,7 +20507,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT == MVT::f32 && Subtarget.hasStdExtZfinx()) return std::make_pair(0U, &RISCV::GPRF32NoX0RegClass); if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) - return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); + return std::make_pair(0U, &RISCV::GPRF64PairNoX0RegClass); return std::make_pair(0U, &RISCV::GPRNoX0RegClass); case 'f': if (VT == MVT::f16) { @@ -20510,11 +20524,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Subtarget.hasStdExtD()) return std::make_pair(0U, &RISCV::FPR64RegClass); if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) - return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); + return std::make_pair(0U, &RISCV::GPRF64PairNoX0RegClass); if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit()) return std::make_pair(0U, &RISCV::GPRNoX0RegClass); } break; + case 'R': + if (VT == MVT::f64 && !Subtarget.is64Bit() && Subtarget.hasStdExtZdinx()) + return std::make_pair(0U, &RISCV::GPRF64PairCRegClass); + return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass); default: break; } @@ -20552,7 +20570,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VT == MVT::f32 && Subtarget.hasStdExtZfinx()) return std::make_pair(0U, &RISCV::GPRF32CRegClass); if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) - return std::make_pair(0U, &RISCV::GPRPairCRegClass); + return std::make_pair(0U, &RISCV::GPRF64PairCRegClass); if (!VT.isVector()) return std::make_pair(0U, &RISCV::GPRCRegClass); } else if (Constraint == "cf") { @@ -20570,7 +20588,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Subtarget.hasStdExtD()) return std::make_pair(0U, &RISCV::FPR64CRegClass); if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit()) - return std::make_pair(0U, &RISCV::GPRPairCRegClass); + return std::make_pair(0U, &RISCV::GPRF64PairCRegClass); if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit()) return std::make_pair(0U, &RISCV::GPRCRegClass); } @@ -20734,7 +20752,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Subtarget into account. if (Res.second == &RISCV::GPRF16RegClass || Res.second == &RISCV::GPRF32RegClass || - Res.second == &RISCV::GPRPairRegClass) + Res.second == &RISCV::GPRF64PairRegClass) return std::make_pair(Res.first, &RISCV::GPRRegClass); return Res; @@ -21360,6 +21378,16 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( unsigned NumParts, MVT PartVT, std::optional CC) const { bool IsABIRegCopy = CC.has_value(); EVT ValueVT = Val.getValueType(); + + if (ValueVT == (Subtarget.is64Bit() ? 
MVT::i128 : MVT::i64) && + NumParts == 1 && PartVT == MVT::Untyped) { + // Pairs in Inline Assembly + MVT XLenVT = Subtarget.getXLenVT(); + auto [Lo, Hi] = DAG.SplitScalar(Val, DL, XLenVT, XLenVT); + Parts[0] = DAG.getNode(RISCVISD::BuildGPRPair, DL, MVT::Untyped, Lo, Hi); + return true; + } + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) { // Cast the [b]f16 to i16, extend to i32, pad with ones to make a float @@ -21436,6 +21464,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, std::optional CC) const { bool IsABIRegCopy = CC.has_value(); + + if (ValueVT == (Subtarget.is64Bit() ? MVT::i128 : MVT::i64) && + NumParts == 1 && PartVT == MVT::Untyped) { + // Pairs in Inline Assembly + MVT XLenVT = Subtarget.getXLenVT(); + SDValue Res = DAG.getNode(RISCVISD::SplitGPRPair, DL, + DAG.getVTList(XLenVT, XLenVT), Parts[0]); + return DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Res.getValue(0), + Res.getValue(1)); + } + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) { SDValue Val = Parts[0]; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9ae70d257fa44..773729d69a143 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -44,6 +44,18 @@ enum NodeType : unsigned { SELECT_CC, BR_CC, + /// Turn a pair of `i`s into an even-odd register pair (`untyped`). + /// - Output: `untyped` even-odd register pair + /// - Input 0: `i` low-order bits, for even register. + /// - Input 1: `i` high-order bits, for odd register. + BuildGPRPair, + + /// Turn an even-odd register pair (`untyped`) into a pair of `i`s. + /// - Output 0: `i` low-order bits, from even register. + /// - Output 1: `i` high-order bits, from odd register. + /// - Input: `untyped` even-odd register pair + SplitGPRPair, + /// Turns a pair of `i32`s into an `f64`. Needed for rv32d/ilp32. /// - Output: `f64`. /// - Input 0: low-order bits (31-0) (as `i32`), for even register. @@ -547,6 +559,11 @@ class RISCVTargetLowering : public TargetLowering { MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; + /// Return the number of registers for a given MVT, for inline assembly + unsigned + getNumRegisters(LLVMContext &Context, EVT VT, + std::optional RegisterVT = std::nullopt) const override; + /// Return the number of registers for a given MVT, ensuring vectors are /// treated as a series of gpr sized integers. 
unsigned getNumRegistersForCallingConv(LLVMContext &Context, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 0de43c458f22c..3c043c3d3864b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -36,7 +36,7 @@ def AddrRegImmINX : ComplexPattern; def GPRPairAsFPR : AsmOperandClass { let Name = "GPRPairAsFPR"; let ParserMethod = "parseGPRPairAsFPR64"; - let PredicateMethod = "isGPRPairAsFPR"; + let PredicateMethod = "isGPRPairAsFPR64"; let RenderMethod = "addRegOperands"; } @@ -52,7 +52,7 @@ def FPR64INX : RegisterOperand { let DecoderMethod = "DecodeGPRRegisterClass"; } -def FPR64IN32X : RegisterOperand { +def FPR64IN32X : RegisterOperand { let ParserMatchClass = GPRPairAsFPR; } @@ -523,15 +523,15 @@ def PseudoFROUND_D_IN32X : PseudoFROUND; /// Loads let isCall = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in -def PseudoRV32ZdinxLD : Pseudo<(outs GPRPair:$dst), (ins GPR:$rs1, simm12:$imm12), []>; +def PseudoRV32ZdinxLD : Pseudo<(outs GPRF64Pair:$dst), (ins GPR:$rs1, simm12:$imm12), []>; def : Pat<(f64 (load (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12))), (PseudoRV32ZdinxLD GPR:$rs1, simm12:$imm12)>; /// Stores let isCall = 0, mayLoad = 0, mayStore = 1, Size = 8, isCodeGenOnly = 1 in -def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRPair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>; -def : Pat<(store (f64 GPRPair:$rs2), (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12)), - (PseudoRV32ZdinxSD GPRPair:$rs2, GPR:$rs1, simm12:$imm12)>; +def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRF64Pair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>; +def : Pat<(store (f64 GPRF64Pair:$rs2), (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12)), + (PseudoRV32ZdinxSD GPRF64Pair:$rs2, GPR:$rs1, simm12:$imm12)>; } // Predicates = [HasStdExtZdinx, IsRV32] let Predicates = [HasStdExtD, IsRV32] in { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 803c3ec195106..e0687b90ad17f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -208,6 +208,8 @@ let RegAltNameIndices = [ABIRegAltName] in { def XLenVT : ValueTypeByHwMode<[RV32, RV64], [i32, i64]>; +defvar XLenPairVT = untyped; + // Allow f64 in GPR for ZDINX on RV64. 
def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; @@ -323,7 +325,7 @@ let RegAltNameIndices = [ABIRegAltName] in { let RegInfos = XLenPairRI, DecoderMethod = "DecodeGPRPairRegisterClass" in { -def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add +def GPRPair : RISCVRegisterClass<[XLenPairVT], 64, (add X10_X11, X12_X13, X14_X15, X16_X17, X6_X7, X28_X29, X30_X31, @@ -332,11 +334,11 @@ def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add X0_Pair, X2_X3, X4_X5 )>; -def GPRPairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRPair, X0_Pair)>; +def GPRPairNoX0 : RISCVRegisterClass<[XLenPairVT], 64, (sub GPRPair, X0_Pair)>; } // let RegInfos = XLenPairRI, DecoderMethod = "DecodeGPRPairRegisterClass" let RegInfos = XLenPairRI in -def GPRPairC : RISCVRegisterClass<[XLenPairFVT], 64, (add +def GPRPairC : RISCVRegisterClass<[XLenPairVT], 64, (add X10_X11, X12_X13, X14_X15, X8_X9 )>; @@ -462,6 +464,21 @@ def GPRF32C : RISCVRegisterClass<[f32], 32, (add (sequence "X%u_W", 10, 15), (sequence "X%u_W", 8, 9))>; def GPRF32NoX0 : RISCVRegisterClass<[f32], 32, (sub GPRF32, X0_W)>; +let DecoderMethod = "DecodeGPRPairRegisterClass" in +def GPRF64Pair : RISCVRegisterClass<[XLenPairFVT], 64, (add + X10_X11, X12_X13, X14_X15, X16_X17, + X6_X7, + X28_X29, X30_X31, + X8_X9, + X18_X19, X20_X21, X22_X23, X24_X25, X26_X27, + X0_Pair, X2_X3, X4_X5 +)>; + +def GPRF64PairC : RISCVRegisterClass<[XLenPairFVT], 64, (add + X10_X11, X12_X13, X14_X15, X8_X9 +)>; + +def GPRF64PairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRF64Pair, X0_Pair)>; //===----------------------------------------------------------------------===// // Vector type mapping to LLVM types. diff --git a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll new file mode 100644 index 0000000000000..04a5d268aebff --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define i64 @test_Pr_wide_scalar_simple(i64 noundef %0) nounwind { +; CHECK-LABEL: test_Pr_wide_scalar_simple: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: # a2 <- a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: ret +entry: + %1 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %0) + ret i64 %1 +} + +define i32 @test_Pr_wide_scalar_with_ops(i32 noundef %0) nounwind { +; CHECK-LABEL: test_Pr_wide_scalar_with_ops: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: #APP +; CHECK-NEXT: # a2 <- a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: or a0, a2, a3 +; CHECK-NEXT: ret +entry: + %1 = zext i32 %0 to i64 + %2 = shl i64 %1, 32 + %3 = or i64 %1, %2 + %4 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %3) + %5 = trunc i64 %4 to i32 + %6 = lshr i64 %4, 32 + %7 = trunc i64 %6 to i32 + %8 = or i32 %5, %7 + ret i32 %8 +} + +define i64 @test_Pr_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind { +; CHECK-LABEL: test_Pr_wide_scalar_inout: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: sw a0, 12(sp) +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sw a1, 0(sp) +; CHECK-NEXT: sw a3, 4(sp) +; CHECK-NEXT: #APP +; CHECK-NEXT: # a0; a2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sw a0, 12(sp) +; CHECK-NEXT: sw a2, 0(sp) +; CHECK-NEXT: sw a3, 4(sp) +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: addi sp, 
sp, 16 +; CHECK-NEXT: ret +entry: + %2 = alloca ptr, align 4 + %3 = alloca i64, align 8 + store ptr %0, ptr %2, align 4 + store i64 %1, ptr %3, align 8 + %4 = load ptr, ptr %2, align 4 + %5 = load i64, ptr %3, align 8 + %6 = call { ptr, i64 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i64 %5) + %7 = extractvalue { ptr, i64} %6, 0 + %8 = extractvalue { ptr, i64 } %6, 1 + store ptr %7, ptr %2, align 4 + store i64 %8, ptr %3, align 8 + %9 = load i64, ptr %3, align 8 + ret i64 %9 +} diff --git a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll new file mode 100644 index 0000000000000..41f353d0781ae --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define i128 @test_R_wide_scalar_simple(i128 noundef %0) nounwind { +; CHECK-LABEL: test_R_wide_scalar_simple: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: # a2 <- a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: ret +entry: + %1 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %0) + ret i128 %1 +} + +define i64 @test_R_wide_scalar_with_ops(i64 noundef %0) nounwind { +; CHECK-LABEL: test_R_wide_scalar_with_ops: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: #APP +; CHECK-NEXT: # a2 <- a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: or a0, a2, a3 +; CHECK-NEXT: ret +entry: + %1 = zext i64 %0 to i128 + %2 = shl i128 %1, 64 + %3 = or i128 %1, %2 + %4 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %3) + %5 = trunc i128 %4 to i64 + %6 = lshr i128 %4, 64 + %7 = trunc i128 %6 to i64 + %8 = or i64 %5, %7 + ret i64 %8 +} + +define i128 @test_R_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind { +; CHECK-LABEL: test_R_wide_scalar_inout: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sd a1, 0(sp) +; CHECK-NEXT: sd a3, 8(sp) +; CHECK-NEXT: #APP +; CHECK-NEXT: # a0; a2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd a2, 0(sp) +; CHECK-NEXT: sd a3, 8(sp) +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: ret +entry: + %2 = alloca ptr, align 8 + %3 = alloca i128, align 16 + store ptr %0, ptr %2, align 8 + store i128 %1, ptr %3, align 16 + %4 = load ptr, ptr %2, align 8 + %5 = load i128, ptr %3, align 16 + %6 = call { ptr, i128 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i128 %5) + %7 = extractvalue { ptr, i128} %6, 0 + %8 = extractvalue { ptr, i128 } %6, 1 + store ptr %7, ptr %2, align 8 + store i128 %8, ptr %3, align 16 + %9 = load i128, ptr %3, align 16 + ret i128 %9 +} diff --git a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll index 18bd41a210f53..aa2df4c61283f 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s \ ; RUN: -target-abi=ilp32 -mattr=+zhinx | FileCheck %s -;; These tests cover the use of `r` and `cr` constraints for floating point values on rv32. +;; These tests cover the use of `r`, `R`, and `cr` constraints for floating point values on rv32. 
;; ;; In particular, there is significant complexity around using paired GPRs for double values on rv32. @@ -26,6 +26,32 @@ entry: ret void } +define dso_local void @zdinx_asm_R(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind { +; CHECK-LABEL: zdinx_asm_R: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: mv s1, a2 +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: #APP +; CHECK-NEXT: fsgnjx.d a2, s0, a4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: sw a2, 8(a0) +; CHECK-NEXT: sw a3, 12(a0) +; CHECK-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +entry: + %arrayidx = getelementptr inbounds double, ptr %a, i32 1 + %0 = tail call double asm "fsgnjx.d $0, $1, $2", "=R,R,R"(double %b, double %c) + store double %0, ptr %arrayidx, align 8 + ret void +} + define dso_local void @zfinx_asm(ptr nocapture noundef writeonly %a, float noundef %b, float noundef %c) nounwind { ; CHECK-LABEL: zfinx_asm: ; CHECK: # %bb.0: # %entry From 0ae58c45330d7b66eabf3db2684aa53144c06063 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Mon, 18 Nov 2024 09:54:43 -0800 Subject: [PATCH 010/366] Revert "[SandboxIR] Add debug checker to compare IR before/after a revert" (#116666) Reverts llvm/llvm-project#115968. It caused buildbot failures. --- llvm/include/llvm/SandboxIR/Context.h | 11 ++-- llvm/include/llvm/SandboxIR/Instruction.h | 1 - llvm/include/llvm/SandboxIR/Tracker.h | 66 ++------------------ llvm/lib/SandboxIR/Tracker.cpp | 73 +---------------------- llvm/unittests/SandboxIR/TrackerTest.cpp | 63 ------------------- 5 files changed, 10 insertions(+), 204 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index b0d6f8335d9e0..f2056de87cb94 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -44,12 +44,11 @@ class Context { protected: LLVMContext &LLVMCtx; - friend class Type; // For LLVMCtx. - friend class PointerType; // For LLVMCtx. - friend class IntegerType; // For LLVMCtx. - friend class StructType; // For LLVMCtx. - friend class Region; // For LLVMCtx. - friend class IRSnapshotChecker; // To snapshot LLVMModuleToModuleMap. + friend class Type; // For LLVMCtx. + friend class PointerType; // For LLVMCtx. + friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. + friend class Region; // For LLVMCtx. 
Tracker IRTracker; diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index 2a59d72e28552..d9642365908d2 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -11,7 +11,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Constant.h" diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index 9a031f3270837..dab20eb809ba0 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -42,12 +42,13 @@ #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StableHashing.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" #include "llvm/SandboxIR/Use.h" #include "llvm/Support/Debug.h" #include +#include namespace llvm::sandboxir { @@ -63,56 +64,9 @@ class SwitchInst; class ConstantInt; class ShuffleVectorInst; class CmpInst; +class Module; class GlobalVariable; -#ifndef NDEBUG - -/// A class that saves hashes and textual IR snapshots of functions in a -/// SandboxIR Context, and does hash comparison when `expectNoDiff` is called. -/// If hashes differ, it prints textual IR for both old and new versions to -/// aid debugging. -/// -/// This is used as an additional debug check when reverting changes to -/// SandboxIR, to verify the reverted state matches the initial state. -class IRSnapshotChecker { - Context &Ctx; - - // A snapshot of textual IR for a function, with a hash for quick comparison. - struct FunctionSnapshot { - llvm::stable_hash Hash; - std::string TextualIR; - }; - - // A snapshot for each llvm::Function found in every module in the SandboxIR - // Context. In practice there will always be one module, but sandbox IR - // save/restore ops work at the Context level, so we must take the full state - // into account. - using ContextSnapshot = DenseMap; - - ContextSnapshot OrigContextSnapshot; - - // Dumps to a string the textual IR for a single Function. - std::string dumpIR(const llvm::Function &F) const; - - // Returns a snapshot of all the modules in the sandbox IR context. - ContextSnapshot takeSnapshot() const; - - // Compares two snapshots and returns true if they differ. - bool diff(const ContextSnapshot &Orig, const ContextSnapshot &Curr) const; - -public: - IRSnapshotChecker(Context &Ctx) : Ctx(Ctx) {} - - /// Saves a snapshot of the current state. If there was any previous snapshot, - /// it will be replaced with the new one. - void save(); - - /// Checks current state against saved state, crashes if different. - void expectNoDiff(); -}; - -#endif // NDEBUG - /// The base class for IR Change classes. class IRChangeBase { protected: @@ -451,10 +405,6 @@ class Tracker { TrackerState State = TrackerState::Disabled; Context &Ctx; -#ifndef NDEBUG - IRSnapshotChecker SnapshotChecker; -#endif - public: #ifndef NDEBUG /// Helps catch bugs where we are creating new change objects while in the @@ -462,15 +412,7 @@ class Tracker { bool InMiddleOfCreatingChange = false; #endif // NDEBUG - explicit Tracker(Context &Ctx) - : Ctx(Ctx) -#ifndef NDEBUG - , - SnapshotChecker(Ctx) -#endif - { - } - + explicit Tracker(Context &Ctx) : Ctx(Ctx) {} ~Tracker(); Context &getContext() const { return Ctx; } /// Record \p Change and take ownership. 
This is the main function used to diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 27ed37aa9bdd3..d35e3ba84990f 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -10,75 +10,12 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/StructuralHash.h" #include "llvm/SandboxIR/Instruction.h" #include using namespace llvm::sandboxir; #ifndef NDEBUG - -std::string IRSnapshotChecker::dumpIR(const llvm::Function &F) const { - std::string Result; - raw_string_ostream SS(Result); - F.print(SS, /*AssemblyAnnotationWriter=*/nullptr); - return Result; -} - -IRSnapshotChecker::ContextSnapshot IRSnapshotChecker::takeSnapshot() const { - ContextSnapshot Result; - for (const auto &Entry : Ctx.LLVMModuleToModuleMap) - for (const auto &F : *Entry.first) { - FunctionSnapshot Snapshot; - Snapshot.Hash = StructuralHash(F, /*DetailedHash=*/true); - Snapshot.TextualIR = dumpIR(F); - Result[&F] = Snapshot; - } - return Result; -} - -bool IRSnapshotChecker::diff(const ContextSnapshot &Orig, - const ContextSnapshot &Curr) const { - bool DifferenceFound = false; - for (const auto &[F, OrigFS] : Orig) { - auto CurrFSIt = Curr.find(F); - if (CurrFSIt == Curr.end()) { - DifferenceFound = true; - dbgs() << "Function " << F->getName() << " not found in current IR.\n"; - dbgs() << OrigFS.TextualIR << "\n"; - continue; - } - const FunctionSnapshot &CurrFS = CurrFSIt->second; - if (OrigFS.Hash != CurrFS.Hash) { - DifferenceFound = true; - dbgs() << "Found IR difference in Function " << F->getName() << "\n"; - dbgs() << "Original:\n" << OrigFS.TextualIR << "\n"; - dbgs() << "Current:\n" << CurrFS.TextualIR << "\n"; - } - } - // Check that Curr doesn't contain any new functions. - for (const auto &[F, CurrFS] : Curr) { - if (!Orig.contains(F)) { - DifferenceFound = true; - dbgs() << "Function " << F->getName() - << " found in current IR but not in original snapshot.\n"; - dbgs() << CurrFS.TextualIR << "\n"; - } - } - return DifferenceFound; -} - -void IRSnapshotChecker::save() { OrigContextSnapshot = takeSnapshot(); } - -void IRSnapshotChecker::expectNoDiff() { - ContextSnapshot CurrContextSnapshot = takeSnapshot(); - if (diff(OrigContextSnapshot, CurrContextSnapshot)) { - llvm_unreachable( - "Original and current IR differ! 
Probably a checkpointing bug."); - } -} - void UseSet::dump() const { dump(dbgs()); dbgs() << "\n"; @@ -338,12 +275,7 @@ void CmpSwapOperands::dump() const { } #endif -void Tracker::save() { - State = TrackerState::Record; -#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) - SnapshotChecker.save(); -#endif -} +void Tracker::save() { State = TrackerState::Record; } void Tracker::revert() { assert(State == TrackerState::Record && "Forgot to save()!"); @@ -351,9 +283,6 @@ void Tracker::revert() { for (auto &Change : reverse(Changes)) Change->revert(*this); Changes.clear(); -#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) - SnapshotChecker.expectNoDiff(); -#endif } void Tracker::accept() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index cee13222179dc..4f2cfa6b06ecd 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1844,66 +1844,3 @@ define void @foo(i32 %arg, float %farg) { Ctx.revert(); EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); } - -TEST_F(TrackerTest, IRSnapshotCheckerNoChanges) { - parseIR(C, R"IR( -define i32 @foo(i32 %arg) { - %add0 = add i32 %arg, %arg - ret i32 %add0 -} -)IR"); - Function &LLVMF = *M->getFunction("foo"); - sandboxir::Context Ctx(C); - - [[maybe_unused]] auto *F = Ctx.createFunction(&LLVMF); - sandboxir::IRSnapshotChecker Checker(Ctx); - Checker.save(); - Checker.expectNoDiff(); -} - -TEST_F(TrackerTest, IRSnapshotCheckerDiesWithUnexpectedChanges) { - parseIR(C, R"IR( -define i32 @foo(i32 %arg) { - %add0 = add i32 %arg, %arg - %add1 = add i32 %add0, %arg - ret i32 %add1 -} -)IR"); - Function &LLVMF = *M->getFunction("foo"); - sandboxir::Context Ctx(C); - - auto *F = Ctx.createFunction(&LLVMF); - auto *BB = &*F->begin(); - auto It = BB->begin(); - sandboxir::Instruction *Add0 = &*It++; - sandboxir::Instruction *Add1 = &*It++; - sandboxir::IRSnapshotChecker Checker(Ctx); - Checker.save(); - Add1->setOperand(1, Add0); - EXPECT_DEATH(Checker.expectNoDiff(), "Found IR difference"); -} - -TEST_F(TrackerTest, IRSnapshotCheckerSaveMultipleTimes) { - parseIR(C, R"IR( -define i32 @foo(i32 %arg) { - %add0 = add i32 %arg, %arg - %add1 = add i32 %add0, %arg - ret i32 %add1 -} -)IR"); - Function &LLVMF = *M->getFunction("foo"); - sandboxir::Context Ctx(C); - - auto *F = Ctx.createFunction(&LLVMF); - auto *BB = &*F->begin(); - auto It = BB->begin(); - sandboxir::Instruction *Add0 = &*It++; - sandboxir::Instruction *Add1 = &*It++; - sandboxir::IRSnapshotChecker Checker(Ctx); - Checker.save(); - Add1->setOperand(1, Add0); - // Now IR differs from the last snapshot. Let's take a new snapshot. - Checker.save(); - // The new snapshot should have replaced the old one, so this should succeed. - Checker.expectNoDiff(); -} From 900c0565314618ec142b020cea1f9c86e2f8282b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 18 Nov 2024 10:07:20 -0800 Subject: [PATCH 011/366] [RISCV] Add an implementation of findRepresentativeClass to assign i32 to GPRRegClass for RV64. (#116165) This is an alternative fix for #81192. This allows the SelectionDAG scheduler to be able to find a representative register class for i32 on RV64. The representative register class is the super register class with the largest spill size that is also legal. The default implementation of findRepresentativeClass only works for legal types which i32 is not for RV64. I did some investigation of why tablegen uses i32 in output patterns on RV64. 
It appears it comes down to a function called ForceArbitraryInstResultType that picks a type for the output pattern when the isel pattern isn't specific enough. I believe it picks the smallest type(lowested numbered) to resolve the conflict. A similar issue occurs for f16 and bf16 which both use the FPR16 register class. If the isel pattern doesn't specify, tablegen may find both f16 and bf16 and may pick bf16 from Zfh pattern when Zfbfmin isn't present. Since bf16 isn't legal in that case, findRepresentativeClass will fail. For i8, i16, i32, this patch calls the base class with XLenVT to get the representative class since XLenVT is always legal. For bf16/f16, we call the base class with f32 since all of the f16/bf16 extensions depend on either F or Zfinx which will make f32 a legal type. The final representative register class further depends on whether D or Zdinx is also enabled, but that should be handled by the default implementation. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 30 +++++++++++++++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +++ 2 files changed, 33 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 60fc024f0d274..35040734d71df 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22051,6 +22051,36 @@ SDValue RISCVTargetLowering::expandIndirectJTBranch(const SDLoc &dl, return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG); } +// If an output pattern produces multiple instructions tablegen may pick an +// arbitrary type from an instructions destination register class to use for the +// VT of that MachineSDNode. This VT may be used to look up the representative +// register class. If the type isn't legal, the default implementation will +// not find a register class. +// +// Some integer types smaller than XLen are listed in the GPR register class to +// support isel patterns for GISel, but are not legal in SelectionDAG. The +// arbitrary type tablegen picks may be one of these smaller types. +// +// f16 and bf16 are both valid for the FPR16 or GPRF16 register class. It's +// possible for tablegen to pick bf16 as the arbitrary type for an f16 pattern. 
+std::pair +RISCVTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { + switch (VT.SimpleTy) { + default: + break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + return TargetLowering::findRepresentativeClass(TRI, Subtarget.getXLenVT()); + case MVT::bf16: + case MVT::f16: + return TargetLowering::findRepresentativeClass(TRI, MVT::f32); + } + + return TargetLowering::findRepresentativeClass(TRI, VT); +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 773729d69a143..7ada941563c1f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -1068,6 +1068,9 @@ class RISCVTargetLowering : public TargetLowering { SDValue emitFlushICache(SelectionDAG &DAG, SDValue InChain, SDValue Start, SDValue End, SDValue Flags, SDLoc DL) const; + + std::pair + findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; }; namespace RISCVVIntrinsicsTable { From 589ab28d87616006d7f8cf2402379811e2a6183f Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Mon, 18 Nov 2024 10:08:44 -0800 Subject: [PATCH 012/366] [lldb][sbapi][NFC] Remove commented out typedef from SBBreakpointName (#116434) SBBreakpointName has a typedef for BreakpointHitCallback used in SetCallback(), but this typedef has been commented out in SBBreakpointName and added instead to SBDefines. Since SB API callbacks are placed in SBDefines, this commit removes this commented out portion. --- lldb/include/lldb/API/SBBreakpointName.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/include/lldb/API/SBBreakpointName.h b/lldb/include/lldb/API/SBBreakpointName.h index 838c66385bd12..4b7ad0cce345e 100644 --- a/lldb/include/lldb/API/SBBreakpointName.h +++ b/lldb/include/lldb/API/SBBreakpointName.h @@ -17,10 +17,6 @@ namespace lldb { class LLDB_API SBBreakpointName { public: -// typedef bool (*BreakpointHitCallback)(void *baton, SBProcess &process, -// SBThread &thread, -// lldb::SBBreakpointLocation &location); - SBBreakpointName(); SBBreakpointName(SBTarget &target, const char *name); From a7b2e73bcaa91255a20f1f2e692bec9eb6c17022 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Mon, 18 Nov 2024 10:18:11 -0800 Subject: [PATCH 013/366] Add support for reading the dynamic symbol table from PT_DYNAMIC (#112596) Allow LLDB to parse the dynamic symbol table from an ELF file or memory image in an ELF file that has no section headers. This patch uses the ability to parse the PT_DYNAMIC segment and find the DT_SYMTAB, DT_SYMENT, DT_HASH or DT_GNU_HASH to find and parse the dynamic symbol table if the section headers are not present. It also adds a helper function to read data from a .dynamic key/value pair entry correctly from the file or from memory. 
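For reference, the symbol-count derivation this change relies on can be sketched in a
few lines of standalone C++. This is an illustration only, not code from the patch: the
function names, the demo tables, and the assumption that the hash data is already
mapped, readable, and native-endian are all made up for the example.

  #include <algorithm>
  #include <cstdint>
  #include <iostream>

  // DT_HASH layout: word 0 = nbucket, word 1 = nchain; nchain equals the
  // number of entries in the dynamic symbol table.
  static uint32_t symbolCountFromSysvHash(const uint32_t *Hash) {
    return Hash[1]; // nchain
  }

  // DT_GNU_HASH stores no direct count: take the highest symbol index named
  // by any bucket and walk its chain until an entry with the low
  // "end of chain" bit set is reached. Assumes NBuckets > 0.
  static uint32_t symbolCountFromGnuHash(const uint32_t *GnuHash,
                                         unsigned AddrSize) {
    uint32_t NBuckets = GnuHash[0];
    uint32_t SymOffset = GnuHash[1];
    uint32_t BloomSize = GnuHash[2];
    // Buckets follow the 4-word header and BloomSize address-sized bloom words.
    const uint32_t *Buckets = GnuHash + 4 + BloomSize * (AddrSize / 4);
    uint32_t Last = *std::max_element(Buckets, Buckets + NBuckets);
    if (Last < SymOffset)
      return SymOffset;
    const uint32_t *Chains = Buckets + NBuckets;
    while ((Chains[Last - SymOffset] & 1) == 0)
      ++Last;
    return Last + 1;
  }

  int main() {
    // Synthetic DT_HASH blob: nbucket = 1, nchain = 3 -> 3 dynamic symbols.
    const uint32_t SysvHash[] = {1, 3, 0, 0, 0, 0};
    std::cout << symbolCountFromSysvHash(SysvHash) << '\n'; // prints 3

    // Synthetic 32-bit DT_GNU_HASH: 1 bucket, symoffset 1, 1 bloom word.
    // Bucket 0 points at symbol 1, whose chain entry terminates immediately,
    // so the table describes 2 dynamic symbols (the null symbol plus one).
    const uint32_t GnuHash[] = {1, 1, 1, 0, /*bloom*/ 0, /*bucket*/ 1,
                                /*chain*/ 1};
    std::cout << symbolCountFromGnuHash(GnuHash, 4) << '\n'; // prints 2
    return 0;
  }

Multiplying DT_SYMENT by the count obtained this way gives the number of bytes to read
starting at DT_SYMTAB, which is what GetDynsymDataFromDynamic does below.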
--- .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 182 ++++++++++++++++-- .../Plugins/ObjectFile/ELF/ObjectFileELF.h | 41 ++++ .../test/Shell/ObjectFile/ELF/elf-dynsym.test | 42 ++++ 3 files changed, 244 insertions(+), 21 deletions(-) create mode 100644 lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 9c7dff8127f47..8df226817326d 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/MipsABIFlags.h" +#include "lldb/Target/Process.h" #define CASE_AND_STREAM(s, def, width) \ case def: \ @@ -3007,9 +3008,10 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) { // section, nomatter if .symtab was already parsed or not. This is because // minidebuginfo normally removes the .symtab symbols which have their // matching .dynsym counterparts. + Section *dynsym = nullptr; if (!symtab || GetSectionList()->FindSectionByName(ConstString(".gnu_debugdata"))) { - Section *dynsym = + dynsym = section_list->FindSectionByType(eSectionTypeELFDynamicSymbols, true) .get(); if (dynsym) { @@ -3019,6 +3021,20 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) { m_address_class_map.merge(address_class_map); } } + if (!dynsym) { + // Try and read the dynamic symbol table from the .dynamic section. + uint32_t num_symbols = 0; + std::optional symtab_data = + GetDynsymDataFromDynamic(num_symbols); + std::optional strtab_data = GetDynstrData(); + if (symtab_data && strtab_data) { + auto [num_symbols_parsed, address_class_map] = + ParseSymbols(&lldb_symtab, symbol_id, section_list, num_symbols, + symtab_data.value(), strtab_data.value()); + symbol_id += num_symbols_parsed; + m_address_class_map.merge(address_class_map); + } + } // DT_JMPREL // If present, this entry's d_ptr member holds the address of @@ -3828,6 +3844,33 @@ ObjectFileELF::MapFileDataWritable(const FileSpec &file, uint64_t Size, Offset); } +std::optional +ObjectFileELF::ReadDataFromDynamic(const ELFDynamic *dyn, uint64_t length, + uint64_t offset) { + // ELFDynamic values contain a "d_ptr" member that will be a load address if + // we have an ELF file read from memory, or it will be a file address if it + // was read from a ELF file. This function will correctly fetch data pointed + // to by the ELFDynamic::d_ptr, or return std::nullopt if the data isn't + // available. + const lldb::addr_t d_ptr_addr = dyn->d_ptr + offset; + if (ProcessSP process_sp = m_process_wp.lock()) { + if (DataBufferSP data_sp = ReadMemory(process_sp, d_ptr_addr, length)) + return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); + } else { + // We have an ELF file with no section headers or we didn't find the + // .dynamic section. Try and find the .dynstr section. + Address addr; + if (!addr.ResolveAddressUsingFileSections(d_ptr_addr, GetSectionList())) + return std::nullopt; + DataExtractor data; + addr.GetSection()->GetSectionData(data); + return DataExtractor(data, + d_ptr_addr - addr.GetSection()->GetFileAddress(), + length); + } + return std::nullopt; +} + std::optional ObjectFileELF::GetDynstrData() { if (SectionList *section_list = GetSectionList()) { // Find the SHT_DYNAMIC section. @@ -3855,31 +3898,15 @@ std::optional ObjectFileELF::GetDynstrData() { // and represent the dynamic symbol tables's string table. 
These are needed // by the dynamic loader and we can read them from a process' address space. // - // When loading and ELF file from memory, only the program headers end up - // being mapped into memory, and we can find these values in the PT_DYNAMIC - // segment. + // When loading and ELF file from memory, only the program headers are + // guaranteed end up being mapped into memory, and we can find these values in + // the PT_DYNAMIC segment. const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB); const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ); if (strtab == nullptr || strsz == nullptr) return std::nullopt; - if (ProcessSP process_sp = m_process_wp.lock()) { - if (DataBufferSP data_sp = - ReadMemory(process_sp, strtab->d_ptr, strsz->d_val)) - return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); - } else { - // We have an ELF file with no section headers or we didn't find the - // .dynamic section. Try and find the .dynstr section. - Address addr; - if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) { - DataExtractor data; - addr.GetSection()->GetSectionData(data); - return DataExtractor(data, - strtab->d_ptr - addr.GetSection()->GetFileAddress(), - strsz->d_val); - } - } - return std::nullopt; + return ReadDataFromDynamic(strtab, strsz->d_val, /*offset=*/0); } std::optional ObjectFileELF::GetDynamicData() { @@ -3912,3 +3939,116 @@ std::optional ObjectFileELF::GetDynamicData() { } return std::nullopt; } + +std::optional ObjectFileELF::GetNumSymbolsFromDynamicHash() { + const ELFDynamic *hash = FindDynamicSymbol(DT_HASH); + if (hash == nullptr) + return std::nullopt; + + // The DT_HASH header looks like this: + struct DtHashHeader { + uint32_t nbucket; + uint32_t nchain; + }; + if (auto data = ReadDataFromDynamic(hash, 8)) { + // We don't need the number of buckets value "nbucket", we just need the + // "nchain" value which contains the number of symbols. + offset_t offset = offsetof(DtHashHeader, nchain); + return data->GetU32(&offset); + } + + return std::nullopt; +} + +std::optional ObjectFileELF::GetNumSymbolsFromDynamicGnuHash() { + const ELFDynamic *gnu_hash = FindDynamicSymbol(DT_GNU_HASH); + if (gnu_hash == nullptr) + return std::nullopt; + + // Create a DT_GNU_HASH header + // https://flapenguin.me/elf-dt-gnu-hash + struct DtGnuHashHeader { + uint32_t nbuckets = 0; + uint32_t symoffset = 0; + uint32_t bloom_size = 0; + uint32_t bloom_shift = 0; + }; + uint32_t num_symbols = 0; + // Read enogh data for the DT_GNU_HASH header so we can extract the values. + if (auto data = ReadDataFromDynamic(gnu_hash, sizeof(DtGnuHashHeader))) { + offset_t offset = 0; + DtGnuHashHeader header; + header.nbuckets = data->GetU32(&offset); + header.symoffset = data->GetU32(&offset); + header.bloom_size = data->GetU32(&offset); + header.bloom_shift = data->GetU32(&offset); + const size_t addr_size = GetAddressByteSize(); + const addr_t buckets_offset = + sizeof(DtGnuHashHeader) + addr_size * header.bloom_size; + std::vector buckets; + if (auto bucket_data = ReadDataFromDynamic(gnu_hash, header.nbuckets * 4, buckets_offset)) { + offset = 0; + for (uint32_t i = 0; i < header.nbuckets; ++i) + buckets.push_back(bucket_data->GetU32(&offset)); + // Locate the chain that handles the largest index bucket. 
+ uint32_t last_symbol = 0; + for (uint32_t bucket_value : buckets) + last_symbol = std::max(bucket_value, last_symbol); + if (last_symbol < header.symoffset) { + num_symbols = header.symoffset; + } else { + // Walk the bucket's chain to add the chain length to the total. + const addr_t chains_base_offset = buckets_offset + header.nbuckets * 4; + for (;;) { + if (auto chain_entry_data = ReadDataFromDynamic(gnu_hash, 4, chains_base_offset + (last_symbol - header.symoffset) * 4)) { + offset = 0; + uint32_t chain_entry = chain_entry_data->GetU32(&offset); + ++last_symbol; + // If the low bit is set, this entry is the end of the chain. + if (chain_entry & 1) + break; + } else { + break; + } + } + num_symbols = last_symbol; + } + } + } + if (num_symbols > 0) + return num_symbols; + + return std::nullopt; +} + +std::optional +ObjectFileELF::GetDynsymDataFromDynamic(uint32_t &num_symbols) { + // Every ELF file which represents an executable or shared library has + // mandatory .dynamic entries. The DT_SYMTAB value contains a pointer to the + // symbol table, and DT_SYMENT contains the size of a symbol table entry. + // We then can use either the DT_HASH or DT_GNU_HASH to find the number of + // symbols in the symbol table as the symbol count is not stored in the + // .dynamic section as a key/value pair. + // + // When loading and ELF file from memory, only the program headers end up + // being mapped into memory, and we can find these values in the PT_DYNAMIC + // segment. + num_symbols = 0; + // Get the process in case this is an in memory ELF file. + ProcessSP process_sp(m_process_wp.lock()); + const ELFDynamic *symtab = FindDynamicSymbol(DT_SYMTAB); + const ELFDynamic *syment = FindDynamicSymbol(DT_SYMENT); + // DT_SYMTAB and DT_SYMENT are mandatory. + if (symtab == nullptr || syment == nullptr) + return std::nullopt; + + if (std::optional syms = GetNumSymbolsFromDynamicHash()) + num_symbols = *syms; + else if (std::optional syms = GetNumSymbolsFromDynamicGnuHash()) + num_symbols = *syms; + else + return std::nullopt; + if (num_symbols == 0) + return std::nullopt; + return ReadDataFromDynamic(symtab, syment->d_val * num_symbols); +} diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h index aba3a5bfcbf5b..16c216eb81e72 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h @@ -435,6 +435,47 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// \return The bytes that represent the string table data or \c std::nullopt /// if an error occured. std::optional GetDynstrData(); + + /// Read the bytes pointed to by the \a dyn dynamic entry. + /// + /// ELFDynamic::d_ptr values contain file addresses if we load the ELF file + /// form a file on disk, or they contain load addresses if they were read + /// from memory. This function will correctly extract the data in both cases + /// if it is available. + /// + /// \param[in] dyn The dynamic entry to use to fetch the data from. + /// + /// \param[in] length The number of bytes to read. + /// + /// \param[in] offset The number of bytes to skip after the d_ptr value + /// before reading data. + /// + /// \return The bytes that represent the dynanic entries data or + /// \c std::nullopt if an error occured or the data is not available. 
+ std::optional + ReadDataFromDynamic(const elf::ELFDynamic *dyn, uint64_t length, + uint64_t offset = 0); + + /// Get the bytes that represent the dynamic symbol table from the .dynamic + /// section from process memory. + /// + /// This functon uses the DT_SYMTAB value from the .dynamic section to read + /// the symbols table data from process memory. The number of symbols in the + /// symbol table is calculated by looking at the DT_HASH or DT_GNU_HASH + /// values as the symbol count isn't stored in the .dynamic section. + /// + /// \return The bytes that represent the symbol table data from the .dynamic + /// section or section headers or \c std::nullopt if an error + /// occured or if there is no dynamic symbol data available. + std::optional + GetDynsymDataFromDynamic(uint32_t &num_symbols); + + /// Get the number of symbols from the DT_HASH dynamic entry. + std::optional GetNumSymbolsFromDynamicHash(); + + /// Get the number of symbols from the DT_GNU_HASH dynamic entry. + std::optional GetNumSymbolsFromDynamicGnuHash(); + }; #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test b/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test new file mode 100644 index 0000000000000..7d948e2cd225c --- /dev/null +++ b/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test @@ -0,0 +1,42 @@ +// This test verifies that loading an ELF file that has no section headers can +// load the dynamic symbol table using the DT_SYMTAB, DT_SYMENT, DT_HASH or +// the DT_GNU_HASH .dynamic key/value pairs that are loaded via the PT_DYNAMIC +// segment. + +// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \ +// RUN: -o - - <<<".globl defined, undefined; defined:" | \ +// RUN: ld.lld /dev/stdin -o - --hash-style=gnu -export-dynamic -shared \ +// RUN: -z nosectionheader -o %t.gnu +// RUN: %lldb %t.gnu -b \ +// RUN: -o "image dump objfile" \ +// RUN: | FileCheck %s --dump-input=always --check-prefix=GNU +// GNU: (lldb) image dump objfile +// GNU: Dumping headers for 1 module(s). +// GNU: ObjectFileELF, file = +// GNU: ELF Header +// GNU: e_type = 0x0003 ET_DYN +// Make sure there are no section headers +// GNU: e_shnum = 0x00000000 +// Make sure we were able to load the symbols +// GNU: Symtab, file = {{.*}}elf-dynsym.test.tmp.gnu, num_symbols = 2: +// GNU-DAG: undefined +// GNU-DAG: defined + +// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \ +// RUN: -o - - <<<".globl defined, undefined; defined:" | \ +// RUN: ld.lld /dev/stdin -o - --hash-style=sysv -export-dynamic -shared \ +// RUN: -z nosectionheader -o %t.sysv +// RUN: %lldb %t.sysv -b \ +// RUN: -o "image dump objfile" \ +// RUN: | FileCheck %s --dump-input=always --check-prefix=HASH +// HASH: (lldb) image dump objfile +// HASH: Dumping headers for 1 module(s). +// HASH: ObjectFileELF, file = +// HASH: ELF Header +// HASH: e_type = 0x0003 ET_DYN +// Make sure there are no section headers +// HASH: e_shnum = 0x00000000 +// Make sure we were able to load the symbols +// HASH: Symtab, file = {{.*}}elf-dynsym.test.tmp.sysv, num_symbols = 2: +// HASH-DAG: undefined +// HASH-DAG: defined From ab4253f6dff194a1e09448c8628809d21f148df9 Mon Sep 17 00:00:00 2001 From: Michele Scandale Date: Mon, 18 Nov 2024 10:24:09 -0800 Subject: [PATCH 014/366] [Analysis] Remove global state from `PluginInline{Advisor,Order}Analysis`. (#114615) The plugin analysis for `InlineAdvisor` and `InlineOrder` currently relies on shared global state to keep track if the analysis is available. 
This causes issues when pipelines using plugins and pipelines not using plugins are run in the same process. The shared global state can be easily replaced by checking in the given instance of `ModuleAnalysisManager` if the plugin analysis has been registered. --- llvm/include/llvm/Analysis/InlineAdvisor.h | 2 - llvm/include/llvm/Analysis/InlineOrder.h | 5 --- llvm/include/llvm/IR/PassManager.h | 12 ++++-- llvm/lib/Analysis/InlineAdvisor.cpp | 3 +- llvm/lib/Analysis/InlineOrder.cpp | 3 +- .../PluginInlineAdvisorAnalysisTest.cpp | 41 +++++-------------- .../PluginInlineOrderAnalysisTest.cpp | 6 --- 7 files changed, 20 insertions(+), 52 deletions(-) diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 700c3b0f18b8d..b002bec2c9183 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -287,7 +287,6 @@ class PluginInlineAdvisorAnalysis : public AnalysisInfoMixin { public: static AnalysisKey Key; - static bool HasBeenRegistered; typedef InlineAdvisor *(*AdvisorFactory)(Module &M, FunctionAnalysisManager &FAM, @@ -295,7 +294,6 @@ class PluginInlineAdvisorAnalysis InlineContext IC); PluginInlineAdvisorAnalysis(AdvisorFactory Factory) : Factory(Factory) { - HasBeenRegistered = true; assert(Factory != nullptr && "The plugin advisor factory should not be a null pointer."); } diff --git a/llvm/include/llvm/Analysis/InlineOrder.h b/llvm/include/llvm/Analysis/InlineOrder.h index 2fa2d6091303a..498cef314b5c3 100644 --- a/llvm/include/llvm/Analysis/InlineOrder.h +++ b/llvm/include/llvm/Analysis/InlineOrder.h @@ -59,7 +59,6 @@ class PluginInlineOrderAnalysis ModuleAnalysisManager &MAM, Module &M); PluginInlineOrderAnalysis(InlineOrderFactory Factory) : Factory(Factory) { - HasBeenRegistered = true; assert(Factory != nullptr && "The plugin inline order factory should not be a null pointer."); } @@ -71,11 +70,7 @@ class PluginInlineOrderAnalysis Result run(Module &, ModuleAnalysisManager &) { return {Factory}; } Result getResult() { return {Factory}; } - static bool isRegistered() { return HasBeenRegistered; } - static void unregister() { HasBeenRegistered = false; } - private: - static bool HasBeenRegistered; InlineOrderFactory Factory; }; diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index d269221fac070..5dab9d0d0a797 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -398,6 +398,11 @@ template class AnalysisManager { AnalysisResultLists.clear(); } + /// Returns true if the specified analysis pass is registered. + template bool isPassRegistered() const { + return AnalysisPasses.count(PassT::ID()); + } + /// Get the result of an analysis pass for a given IR unit. /// /// Runs the analysis if a cached result is not available. @@ -458,10 +463,9 @@ template class AnalysisManager { /// and this function returns true. /// /// (Note: Although the return value of this function indicates whether or not - /// an analysis was previously registered, there intentionally isn't a way to - /// query this directly. Instead, you should just register all the analyses - /// you might want and let this class run them lazily. This idiom lets us - /// minimize the number of times we have to look up analyses in our + /// an analysis was previously registered, you should just register all the + /// analyses you might want and let this class run them lazily. 
This idiom + /// lets us minimize the number of times we have to look up analyses in our /// hashtable.) template bool registerPass(PassBuilderT &&PassBuilder) { diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index 45702fa25d8b1..12553dd446a61 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -199,13 +199,12 @@ void InlineAdvice::recordInliningWithCalleeDeleted() { AnalysisKey InlineAdvisorAnalysis::Key; AnalysisKey PluginInlineAdvisorAnalysis::Key; -bool PluginInlineAdvisorAnalysis::HasBeenRegistered = false; bool InlineAdvisorAnalysis::Result::tryCreate( InlineParams Params, InliningAdvisorMode Mode, const ReplayInlinerSettings &ReplaySettings, InlineContext IC) { auto &FAM = MAM.getResult(M).getManager(); - if (PluginInlineAdvisorAnalysis::HasBeenRegistered) { + if (MAM.isPassRegistered()) { auto &DA = MAM.getResult(M); Advisor.reset(DA.Factory(M, FAM, Params, IC)); return !!Advisor; diff --git a/llvm/lib/Analysis/InlineOrder.cpp b/llvm/lib/Analysis/InlineOrder.cpp index f156daa2f126f..8d920153f250d 100644 --- a/llvm/lib/Analysis/InlineOrder.cpp +++ b/llvm/lib/Analysis/InlineOrder.cpp @@ -283,7 +283,6 @@ class PriorityInlineOrder : public InlineOrder> { } // namespace AnalysisKey llvm::PluginInlineOrderAnalysis::Key; -bool llvm::PluginInlineOrderAnalysis::HasBeenRegistered; std::unique_ptr>> llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM, @@ -313,7 +312,7 @@ llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM, std::unique_ptr>> llvm::getInlineOrder(FunctionAnalysisManager &FAM, const InlineParams &Params, ModuleAnalysisManager &MAM, Module &M) { - if (llvm::PluginInlineOrderAnalysis::isRegistered()) { + if (MAM.isPassRegistered()) { LLVM_DEBUG(dbgs() << " Current used priority: plugin ---- \n"); return MAM.getResult(M).Factory(FAM, Params, MAM, M); diff --git a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp index 3330751120e6c..92c0b1bcacb12 100644 --- a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp +++ b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp @@ -87,33 +87,10 @@ struct CompilerInstance { ThinOrFullLTOPhase::None)); } - ~CompilerInstance() { - // Reset the static variable that tracks if the plugin has been registered. - // This is needed to allow the test to run multiple times. - PluginInlineAdvisorAnalysis::HasBeenRegistered = false; - } - std::string output; std::unique_ptr outputM; - // run with the default inliner - auto run_default(StringRef IR) { - PluginInlineAdvisorAnalysis::HasBeenRegistered = false; - outputM = parseAssemblyString(IR, Error, Ctx); - MPM.run(*outputM, MAM); - ASSERT_TRUE(outputM); - output.clear(); - raw_string_ostream o_stream{output}; - outputM->print(o_stream, nullptr); - ASSERT_TRUE(true); - } - - // run with the dnamic inliner - auto run_dynamic(StringRef IR) { - // note typically the constructor for the DynamicInlineAdvisorAnalysis - // will automatically set this to true, we controll it here only to - // altenate between the default and dynamic inliner in our test - PluginInlineAdvisorAnalysis::HasBeenRegistered = true; + auto run(StringRef IR) { outputM = parseAssemblyString(IR, Error, Ctx); MPM.run(*outputM, MAM); ASSERT_TRUE(outputM); @@ -274,14 +251,16 @@ TEST(PluginInlineAdvisorTest, PluginLoad) { // Skip the test if plugins are disabled. 
GTEST_SKIP(); #endif - CompilerInstance CI{}; - CI.setupPlugin(); + CompilerInstance DefaultCI{}; + + CompilerInstance PluginCI{}; + PluginCI.setupPlugin(); for (StringRef IR : TestIRS) { - CI.run_default(IR); - std::string default_output = CI.output; - CI.run_dynamic(IR); - std::string dynamic_output = CI.output; + DefaultCI.run(IR); + std::string default_output = DefaultCI.output; + PluginCI.run(IR); + std::string dynamic_output = PluginCI.output; ASSERT_EQ(default_output, dynamic_output); } } @@ -294,7 +273,7 @@ TEST(PluginInlineAdvisorTest, CustomAdvisor) { CI.setupFooOnly(); for (StringRef IR : TestIRS) { - CI.run_dynamic(IR); + CI.run(IR); CallGraph CGraph = CallGraph(*CI.outputM); for (auto &node : CGraph) { for (auto &edge : *node.second) { diff --git a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp index ca860a0dd5584..0b31b0892d75a 100644 --- a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp +++ b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp @@ -61,12 +61,6 @@ struct CompilerInstance { ThinOrFullLTOPhase::None)); } - ~CompilerInstance() { - // Reset the static variable that tracks if the plugin has been registered. - // This is needed to allow the test to run multiple times. - PluginInlineOrderAnalysis::unregister(); - } - std::string Output; std::unique_ptr OutputM; From ed8ebad6eb84af60d1c1a8826f55d4d347d2e7bd Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Mon, 18 Nov 2024 13:32:58 -0500 Subject: [PATCH 015/366] [SelectionDAG] Support integer promotion for VP_LOAD and VP_STORE (#81299) Add integer promotion support for for VP_LOAD and VP_STORE via legalization of extend and truncate of each form. Patch commandeered from: https://reviews.llvm.org/D109377 --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 36 +++++++++++++++++++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 ++ .../RISCV/rvv/fixed-vectors-vpstore.ll | 16 +++++++-- llvm/test/CodeGen/RISCV/rvv/vpload.ll | 28 ++++++++++----- llvm/test/CodeGen/RISCV/rvv/vpstore.ll | 28 ++++++++++----- 5 files changed, 92 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 45487c887b74d..648719bcabc37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -83,6 +83,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; case ISD::LOAD: Res = PromoteIntRes_LOAD(cast(N)); break; + case ISD::VP_LOAD: + Res = PromoteIntRes_VP_LOAD(cast(N)); + break; case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast(N)); break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); @@ -957,6 +960,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_VP_LOAD(VPLoadSDNode *N) { + assert(!N->isIndexed() && "Indexed vp_load during type legalization!"); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + ISD::LoadExtType ExtType = (N->getExtensionType() == ISD::NON_EXTLOAD) + ? 
ISD::EXTLOAD + : N->getExtensionType(); + SDLoc dl(N); + SDValue Res = + DAG.getLoadVP(N->getAddressingMode(), ExtType, NVT, dl, N->getChain(), + N->getBasePtr(), N->getOffset(), N->getMask(), + N->getVectorLength(), N->getMemoryVT(), N->getMemOperand()); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtPassThru = GetPromotedInteger(N->getPassThru()); @@ -1957,6 +1977,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), OpNo); break; + case ISD::VP_STORE: + Res = PromoteIntOp_VP_STORE(cast(N), OpNo); + break; case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast(N), OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), @@ -2378,6 +2401,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } +SDValue DAGTypeLegalizer::PromoteIntOp_VP_STORE(VPStoreSDNode *N, + unsigned OpNo) { + + assert(OpNo == 1 && "Unexpected operand for promotion"); + assert(!N->isIndexed() && "expecting unindexed vp_store!"); + + SDValue DataOp = GetPromotedInteger(N->getValue()); + return DAG.getTruncStoreVP(N->getChain(), SDLoc(N), DataOp, N->getBasePtr(), + N->getMask(), N->getVectorLength(), + N->getMemoryVT(), N->getMemOperand(), + N->isCompressingStore()); +} + SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { SDValue DataOp = N->getValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index a56cd5423e00b..6eb7628675e64 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -338,6 +338,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_FREEZE(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); + SDValue PromoteIntRes_VP_LOAD(VPLoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N); @@ -420,6 +421,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_ExpOp(SDNode *N); SDValue PromoteIntOp_VECREDUCE(SDNode *N); SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll index 8eaa5efe163cd..d30e8b46e6df2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -28,6 +28,18 @@ define void @vpstore_v4i8(<4 x i8> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl ret void } +declare void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7>, ptr, <8 x i1>, i32) + +define void @vpstore_v8i7(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; 
CHECK-NEXT: ret + call void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.vp.store.v8i8.p0(<8 x i8>, ptr, <8 x i1>, i32) define void @vpstore_v8i8(<8 x i8> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { @@ -285,10 +297,10 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bltu a1, a3, .LBB23_2 +; CHECK-NEXT: bltu a1, a3, .LBB24_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0), v0.t ; CHECK-NEXT: addi a2, a1, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll index 8dfab72d008c2..bd7ea6c19d0b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -65,6 +65,18 @@ define @vpload_nxv3i8(ptr %ptr, %m, i32 zero ret %load } +declare @llvm.vp.load.nxv4i6.nxv4i6.p0(*, , i32) + +define @vpload_nxv4i6(* %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpload_nxv4i6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.vp.load.nxv4i6.nxv4i6.p0(* %ptr, %m, i32 %evl) + ret %load +} + declare @llvm.vp.load.nxv4i8.p0(ptr, , i32) define @vpload_nxv4i8(ptr %ptr, %m, i32 zeroext %evl) { @@ -523,10 +535,10 @@ define @vpload_nxv16f64(ptr %ptr, %m, ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a4), v0.t -; CHECK-NEXT: bltu a1, a2, .LBB43_2 +; CHECK-NEXT: bltu a1, a2, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: .LBB43_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0), v0.t @@ -553,10 +565,10 @@ define @vpload_nxv17f64(ptr %ptr, ptr %out, @vpload_nxv17f64(ptr %ptr, ptr %out, @vpload_nxv17f64(ptr %ptr, ptr %out, %val, ptr %ptr, , *, , i32) + +define void @vpstore_nxv8i12( %val, * %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpstore_nxv8i12: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.vp.store.nxv8i12.nxv8i12.p0( %val, * %ptr, %m, i32 %evl) + ret void +} + declare void @llvm.vp.store.nxv8i16.p0(, ptr, , i32) define void @vpstore_nxv8i16( %val, ptr %ptr, %m, i32 zeroext %evl) { @@ -421,10 +433,10 @@ define void @vpstore_nxv16f64( %val, ptr %ptr, %val, ptr %ptr, %val, ptr %ptr, Date: Mon, 18 Nov 2024 18:35:44 +0000 Subject: [PATCH 016/366] [NVPTX][NFC] Regenerate some tests checks (#116605) Use update_llc_test_checks.py to automate the test checks in some files I was observing changes in locally. 
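For reference, check lines like these are produced by the in-tree update script; a
typical invocation looks something like the following (the build-directory path is an
assumption, not part of this change):

  llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll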
--- .../NVPTX/bf16x2-instructions-approx.ll | 61 +- .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 1027 ++++-- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 2858 +++++++++++------ llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 1110 ++++--- llvm/test/CodeGen/NVPTX/i8x2-instructions.ll | 41 +- 5 files changed, 3318 insertions(+), 1779 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll index a53c90ac6db8b..3e54aaf558072 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %} @@ -6,36 +7,48 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0 declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0 -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sin_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.bf16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 { +; CHECK-LABEL: test_sin( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: sin.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1; +; CHECK-NEXT: sin.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_cos_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.bf16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 { +; CHECK-LABEL: test_cos( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: cos.approx.f32 %f2, %f1; +; CHECK-NEXT: 
cvt.rn.bf16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1; +; CHECK-NEXT: cos.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 925ae4245a4c2..e545d4c117791 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} @@ -5,163 +6,231 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; CHECK-LABEL: test_ret_const( -; CHECK: mov.b32 [[T:%r[0-9+]]], 1073758080; -; CHECK: st.param.b32 [func_retval0], [[T]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 1073758080; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; ret <2 x bfloat> } ; Check that we can lower fadd with immediate arguments. -; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fadd_imm_0_param_0]; -; -; SM90-DAG: mov.b32 [[I:%r[0-9+]]], 1073758080; -; SM90-DAG: add.rn.bf16x2 [[R:%r[0-9]+]], [[A]], [[I]]; -; -; SM80-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; SM80-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]] -; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]] -; SM80-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; -; SM80-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; -; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; SM80-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 { +; SM80-LABEL: test_fadd_imm_0( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: add.rn.f32 %f2, %f1, 0f40000000; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: add.rn.f32 %f4, %f3, 0f3F800000; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_fadd_imm_0( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; SM90-NEXT: mov.b32 %r2, 1073758080; +; SM90-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-NEXT: ret; %r = fadd <2 x bfloat> , %a ret <2 x bfloat> %r } -; CHECK-LABEL: test_fadd_imm_1( -; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fadd_imm_1_param_0]; -; SM90: mov.b16 [[B:%rs[0-9]+]], 0x3F80; -; SM90: 
add.rn.bf16 [[R:%rs[0-9]+]], [[A]], [[B]]; - -; SM80-DAG: cvt.f32.bf16 [[FA:%f[0-9]+]], [[A]]; -; SM80: add.rn.f32 [[FR:%f[0-9]+]], [[FA]], 0f3F800000; -; SM80: cvt.rn.bf16.f32 [[R:%rs[0-9]+]], [[FR]]; - -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define bfloat @test_fadd_imm_1(bfloat %a) #0 { +; SM80-LABEL: test_fadd_imm_1( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<3>; +; SM80-NEXT: .reg .f32 %f<3>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; +; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; +; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2; +; SM80-NEXT: st.param.b16 [func_retval0], %rs2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_fadd_imm_1( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<4>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-NEXT: ret; %r = fadd bfloat %a, 1.0 ret bfloat %r } -; CHECK-LABEL: test_fsubx2( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fsubx2_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fsubx2_param_1]; -; SM90: sub.rn.bf16x2 [[R:%r[0-9]+]], [[A]], [[B]]; - -; SM80-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM80-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]; -; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]; -; SM80-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]]; -; SM80-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]]; -; SM80-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; SM80-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; -; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}; - -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; - define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; SM80-LABEL: test_fsubx2( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<7>; +; SM80-NEXT: .reg .b32 %r<4>; +; SM80-NEXT: .reg .f32 %f<7>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f2, %rs4; +; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3; +; SM80-NEXT: cvt.f32.bf16 %f4, %rs1; +; SM80-NEXT: cvt.f32.bf16 %f5, %rs3; +; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4; +; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; +; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_fsubx2( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_fsubx2_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [test_fsubx2_param_0]; +; SM90-NEXT: sub.rn.bf16x2 %r3, %r2, %r1; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-NEXT: ret; %r = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %r } -; CHECK-LABEL: test_fmulx2( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fmulx2_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fmulx2_param_1]; -; SM90: mul.rn.bf16x2 [[R:%r[0-9]+]], [[A]], [[B]]; - -; SM80-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM80-DAG: mov.b32 {[[B0:%rs[0-9]+]], 
[[B1:%rs[0-9]+]]}, [[B]]; -; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]; -; SM80-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]]; -; SM80-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]]; -; SM80-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; SM80-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; -; SM80: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}; - -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; - define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; SM80-LABEL: test_fmulx2( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<7>; +; SM80-NEXT: .reg .b32 %r<4>; +; SM80-NEXT: .reg .f32 %f<7>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f2, %rs4; +; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3; +; SM80-NEXT: cvt.f32.bf16 %f4, %rs1; +; SM80-NEXT: cvt.f32.bf16 %f5, %rs3; +; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4; +; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6; +; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_fmulx2( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0]; +; SM90-NEXT: mul.rn.bf16x2 %r3, %r2, %r1; +; SM90-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-NEXT: ret; %r = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %r } -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]; -; CHECK-NEXT: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; CHECK-LABEL: test_fdiv( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f2, %rs4; +; CHECK-NEXT: div.rn.f32 %f3, %f2, %f1; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs5, %f3; +; CHECK-NEXT: cvt.f32.bf16 %f4, %rs1; +; CHECK-NEXT: cvt.f32.bf16 %f5, %rs3; +; CHECK-NEXT: div.rn.f32 %f6, %f5, %f4; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs6, %f6; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: 
ret; %r = fdiv <2 x bfloat> %a, %b ret <2 x bfloat> %r } -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_fneg_param_0]; - -; CHECK-DAG: xor.b32 [[IHH0:%r[0-9]+]], [[A]], -2147450880; -; CHECK-NEXT: st.param.b32 [func_retval0], [[IHH0]]; -; CHECK-NEXT: ret; define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 { +; CHECK-LABEL: test_fneg( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_fneg_param_0]; +; CHECK-NEXT: xor.b32 %r2, %r1, -2147450880; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = fneg <2 x bfloat> %a ret <2 x bfloat> %r } -; CHECK-LABEL: .func test_ldst_v2bf16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2bf16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2bf16_param_1]; -; CHECK-DAG: ld.b32 [[E:%r[0-9]+]], [%[[A]]] -; CHECK-DAG: st.b32 [%[[B]]], [[E]]; -; CHECK: ret; define void @test_ldst_v2bf16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v2bf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2bf16_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2bf16_param_1]; +; CHECK-NEXT: st.b32 [%rd2], %r1; +; CHECK-NEXT: ret; %t1 = load <2 x bfloat>, ptr %a store <2 x bfloat> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v3bf16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3bf16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3bf16_param_1]; -; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair -; number of bitshifting instructions that may change at llvm's whim. -; So we only verify that we only issue correct number of writes using -; correct offset, but not the values we write. 
-; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.b16 [%[[B]]+4], -; CHECK: ret; define void @test_ldst_v3bf16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v3bf16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3bf16_param_0]; +; CHECK-NEXT: ld.u64 %rd2, [%rd1]; +; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; } +; CHECK-NEXT: ld.param.u64 %rd3, [test_ldst_v3bf16_param_1]; +; CHECK-NEXT: st.u32 [%rd3], %rd2; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; CHECK-NEXT: st.b16 [%rd3+4], %rs1; +; CHECK-NEXT: ret; %t1 = load <3 x bfloat>, ptr %a store <3 x bfloat> %t1, ptr %b, align 16 ret void @@ -169,161 +238,241 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) { declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0 -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0], [[A]]; -; CHECK-DAG: st.param.b32 [param1], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r } -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_0]; +; CHECK-NEXT: selp.b32 %r3, %r2, %r1, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = select i1 %c, <2 x bfloat> %a, <2 x 
bfloat> %b ret <2 x bfloat> %r } -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%r[0-9]+]], [test_select_cc_param_3]; -; -; SM90: setp.neu.bf16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; -; SM80-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; SM80-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]]; -; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]]; -; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]]; -; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]]; -; SM80-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; SM80-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; - define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 { +; SM80-LABEL: test_select_cc( +; SM80: { +; SM80-NEXT: .reg .pred %p<3>; +; SM80-NEXT: .reg .b16 %rs<11>; +; SM80-NEXT: .reg .b32 %r<6>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; SM80-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; SM80-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; SM80-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM80-NEXT: cvt.f32.bf16 %f2, %rs3; +; SM80-NEXT: setp.neu.f32 %p1, %f2, %f1; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs2; +; SM80-NEXT: cvt.f32.bf16 %f4, %rs4; +; SM80-NEXT: setp.neu.f32 %p2, %f4, %f3; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM80-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; SM80-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; SM80-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; SM80-NEXT: st.param.b32 [func_retval0], %r5; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_select_cc( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<7>; +; SM90-NEXT: .reg .b32 %r<6>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; SM90-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; SM90-NEXT: ld.param.b32 %r3, [test_select_cc_param_3]; +; SM90-NEXT: ld.param.b32 %r4, [test_select_cc_param_2]; +; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r4, %r3; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM90-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; SM90-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; SM90-NEXT: mov.b32 %r5, {%rs6, %rs5}; +; SM90-NEXT: st.param.b32 [func_retval0], %r5; +; SM90-NEXT: ret; %cc = fcmp une <2 x bfloat> %c, %d %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %r } - -; CHECK-LABEL: test_select_cc_f32_bf16( -; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_0]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_select_cc_f32_bf16_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%r[0-9]+]], 
[test_select_cc_f32_bf16_param_3]; -; SM90: setp.neu.bf16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_1]; - -; SM80-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; SM80-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]]; -; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]]; -; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]]; -; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]]; -; SM80-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; SM80-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, +; SM80-LABEL: test_select_cc_f32_bf16( +; SM80: { +; SM80-NEXT: .reg .pred %p<3>; +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<11>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0]; +; SM80-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2]; +; SM80-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f4, %rs3; +; SM80-NEXT: setp.neu.f32 %p1, %f4, %f3; +; SM80-NEXT: cvt.f32.bf16 %f5, %rs2; +; SM80-NEXT: cvt.f32.bf16 %f6, %rs4; +; SM80-NEXT: setp.neu.f32 %p2, %f6, %f5; +; SM80-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1]; +; SM80-NEXT: selp.f32 %f9, %f2, %f8, %p2; +; SM80-NEXT: selp.f32 %f10, %f1, %f7, %p1; +; SM80-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9}; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_select_cc_f32_bf16( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-NEXT: .reg .f32 %f<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3]; +; SM90-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2]; +; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r2, %r1; +; SM90-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1]; +; SM90-NEXT: selp.f32 %f5, %f2, %f4, %p2; +; SM90-NEXT: selp.f32 %f6, %f1, %f3, %p1; +; SM90-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5}; +; SM90-NEXT: ret; <2 x bfloat> %c, <2 x bfloat> %d) #0 { %cc = fcmp une <2 x bfloat> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } -; CHECK-LABEL: test_select_cc_bf16_f32( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_cc_bf16_f32_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_cc_bf16_f32_param_1]; -; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_2]; -; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], 
[[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, +; CHECK-LABEL: test_select_cc_bf16_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1]; +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2]; +; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3]; +; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3; +; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %r } -; CHECK-LABEL: test_fptrunc_2xfloat( -; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs1, %f2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs2, %f1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x bfloat> ret <2 x bfloat> %r } -; CHECK-LABEL: test_fpext_2xfloat( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.bf16 [[R0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.bf16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; -; CHECK: ret; define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 { +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: cvt.f32.bf16 %f2, %rs1; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NEXT: ret; %r = fpext <2 x bfloat> %a to <2 x float> ret <2 x float> %r } -; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xbf16_to_2xi16_param_0]; -; CHECK: st.param.b32 [func_retval0], [[A]] -; CHECK: ret; define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 { +; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, 
[test_bitcast_2xbf16_to_2xi16_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = bitcast <2 x bfloat> %a to <2 x i16> ret <2 x i16> %r } - -; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16( -; CHECK: ld.param.b32 [[R]], [test_bitcast_2xi16_to_2xbf16_param_0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 { +; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi16_to_2xbf16_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = bitcast <2 x i16> %a to <2 x bfloat> ret <2 x bfloat> %r } @@ -351,184 +500,374 @@ declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0 declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0 declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 - -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sqrt_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.bf16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 { +; CHECK-LABEL: test_sqrt( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: sqrt.rn.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1; +; CHECK-NEXT: sqrt.rn.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_fmuladd_param_2]; -; -; CHECK: fma.rn.bf16x2 [[RA:%r[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NEXT: st.param.b32 [func_retval0], [[RA]]; -; CHECK: ret; define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 { +; CHECK-LABEL: test_fmuladd( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fmuladd_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_fmuladd_param_0]; +; CHECK-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %r } -; CHECK-LABEL: test_fabs( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_fabs_param_0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[A]], 2147450879; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 { 
+; CHECK-LABEL: test_fabs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_fabs_param_0]; +; CHECK-NEXT: and.b32 %r2, %r1, 2147450879; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_fabs_add( -; CHECK: abs.bf16x2 -; CHECK: ret; define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; SM80-LABEL: test_fabs_add( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<11>; +; SM80-NEXT: .reg .b32 %r<6>; +; SM80-NEXT: .reg .f32 %f<11>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_fabs_add_param_1]; +; SM80-NEXT: ld.param.b32 %r2, [test_fabs_add_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: add.rn.f32 %f2, %f1, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: add.rn.f32 %f4, %f3, %f3; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; SM80-NEXT: abs.bf16x2 %r4, %r3; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM80-NEXT: cvt.f32.bf16 %f5, %rs6; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f6, %rs8; +; SM80-NEXT: add.rn.f32 %f7, %f5, %f6; +; SM80-NEXT: cvt.rn.bf16.f32 %rs9, %f7; +; SM80-NEXT: cvt.f32.bf16 %f8, %rs5; +; SM80-NEXT: cvt.f32.bf16 %f9, %rs7; +; SM80-NEXT: add.rn.f32 %f10, %f8, %f9; +; SM80-NEXT: cvt.rn.bf16.f32 %rs10, %f10; +; SM80-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; SM80-NEXT: st.param.b32 [func_retval0], %r5; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_fabs_add( +; SM90: { +; SM90-NEXT: .reg .b32 %r<6>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_fabs_add_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [test_fabs_add_param_0]; +; SM90-NEXT: add.rn.bf16x2 %r3, %r2, %r2; +; SM90-NEXT: abs.bf16x2 %r4, %r3; +; SM90-NEXT: add.rn.bf16x2 %r5, %r4, %r1; +; SM90-NEXT: st.param.b32 [func_retval0], %r5; +; SM90-NEXT: ret; %s = fadd <2 x bfloat> %a, %a %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s) %d = fadd <2 x bfloat> %r, %b ret <2 x bfloat> %d } - -; CHECK-LABEL: test_minnum( -; CHECK-DAG: ld.param.b32 [[AF0:%r[0-9]+]], [test_minnum_param_0]; -; CHECK-DAG: ld.param.b32 [[BF0:%r[0-9]+]], [test_minnum_param_1]; -; CHECK-DAG: min.bf16x2 [[RF0:%r[0-9]+]], [[AF0]], [[BF0]]; -; CHECK: st.param.b32 [func_retval0], [[RF0]]; -; CHECK: ret; define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; CHECK-LABEL: test_minnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_0]; +; CHECK-NEXT: min.bf16x2 %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r } -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b32 [[AF0:%r[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b32 [[BF0:%r[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: max.bf16x2 [[RF0:%r[0-9]+]], [[AF0]], [[BF0]]; -; CHECK: st.param.b32 [func_retval0], [[RF0]]; -; CHECK: ret; define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; CHECK-LABEL: test_maxnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_maxnum_param_1]; +; CHECK-NEXT: 
ld.param.b32 %r2, [test_maxnum_param_0]; +; CHECK-NEXT: max.bf16x2 %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r } - - -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_floor_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM90: cvt.rmi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; -; SM90: cvt.rmi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]; -; SM80-DAG: cvt.rmi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]]; -; SM80-DAG: cvt.rmi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 { +; SM80-LABEL: test_floor( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_floor_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: cvt.rmi.f32.f32 %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: cvt.rmi.f32.f32 %f4, %f3; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_floor( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<5>; +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_floor_param_0]; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM90-NEXT: cvt.rmi.bf16.bf16 %rs3, %rs2; +; SM90-NEXT: cvt.rmi.bf16.bf16 %rs4, %rs1; +; SM90-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_ceil_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM90: cvt.rpi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; -; SM90: cvt.rpi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]]; -; SM80-DAG: cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]]; -; SM80-DAG: cvt.rpi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]]; -; SM80-DAG: cvt.rpi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; SM80-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 { +; SM80-LABEL: test_ceil( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: cvt.rpi.f32.f32 %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: cvt.rpi.f32.f32 %f4, %f3; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_ceil( +; SM90: { +; SM90-NEXT: .reg 
.b16 %rs<5>; +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM90-NEXT: cvt.rpi.bf16.bf16 %rs3, %rs2; +; SM90-NEXT: cvt.rpi.bf16.bf16 %rs4, %rs1; +; SM90-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_trunc_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM90: cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; -; SM90: cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 { +; SM80-LABEL: test_trunc( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: cvt.rzi.f32.f32 %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: cvt.rzi.f32.f32 %f4, %f3; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_trunc( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<5>; +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM90-NEXT: cvt.rzi.bf16.bf16 %rs3, %rs2; +; SM90-NEXT: cvt.rzi.bf16.bf16 %rs4, %rs1; +; SM90-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_rint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; SM90: cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]]; -; SM90: cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 { +; SM80-LABEL: test_rint( +; SM80: { +; SM80-NEXT: .reg .b16 %rs<5>; +; SM80-NEXT: .reg .b32 %r<3>; +; SM80-NEXT: .reg .f32 %f<5>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_rint_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM80-NEXT: cvt.f32.bf16 %f1, %rs2; +; SM80-NEXT: cvt.rni.f32.f32 %f2, %f1; +; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f2; +; SM80-NEXT: cvt.f32.bf16 %f3, %rs1; +; SM80-NEXT: cvt.rni.f32.f32 %f4, %f3; +; SM80-NEXT: cvt.rn.bf16.f32 %rs4, %f4; +; SM80-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM80-NEXT: st.param.b32 [func_retval0], %r2; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_rint( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<5>; +; SM90-NEXT: .reg .b32 %r<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_rint_param_0]; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; SM90-NEXT: cvt.rni.bf16.bf16 %rs3, %rs2; +; SM90-NEXT: cvt.rni.bf16.bf16 %rs4, %rs1; +; SM90-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; 
CHECK-LABEL: test_round( -; CHECK: ld.param.b32 {{.*}}, [test_round_param_0]; -; check the use of sign mask and 0.5 to implement round -; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648; -; CHECK: or.b32 {{.*}}, [[R1]], 1056964608; -; CHECK: and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648; -; CHECK: or.b32 {{.*}}, [[R2]], 1056964608; -; CHECK: st.param.b32 [func_retval0], {{.*}}; -; CHECK: ret; define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 { +; CHECK-LABEL: test_round( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<17>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2; +; CHECK-NEXT: mov.b32 %r2, %f1; +; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NEXT: mov.b32 %f2, %r4; +; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; +; CHECK-NEXT: cvt.rzi.f32.f32 %f4, %f3; +; CHECK-NEXT: abs.f32 %f5, %f1; +; CHECK-NEXT: setp.gt.f32 %p1, %f5, 0f4B000000; +; CHECK-NEXT: selp.f32 %f6, %f1, %f4, %p1; +; CHECK-NEXT: cvt.rzi.f32.f32 %f7, %f1; +; CHECK-NEXT: setp.lt.f32 %p2, %f5, 0f3F000000; +; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f8; +; CHECK-NEXT: cvt.f32.bf16 %f9, %rs1; +; CHECK-NEXT: mov.b32 %r5, %f9; +; CHECK-NEXT: and.b32 %r6, %r5, -2147483648; +; CHECK-NEXT: or.b32 %r7, %r6, 1056964608; +; CHECK-NEXT: mov.b32 %f10, %r7; +; CHECK-NEXT: add.rn.f32 %f11, %f9, %f10; +; CHECK-NEXT: cvt.rzi.f32.f32 %f12, %f11; +; CHECK-NEXT: abs.f32 %f13, %f9; +; CHECK-NEXT: setp.gt.f32 %p3, %f13, 0f4B000000; +; CHECK-NEXT: selp.f32 %f14, %f9, %f12, %p3; +; CHECK-NEXT: cvt.rzi.f32.f32 %f15, %f9; +; CHECK-NEXT: setp.lt.f32 %p4, %f13, 0f3F000000; +; CHECK-NEXT: selp.f32 %f16, %f15, %f14, %p4; +; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f16; +; CHECK-NEXT: mov.b32 %r8, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) ret <2 x bfloat> %r } -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_param_1]; -; SM80-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; SM80-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; SM80-DAG: abs.bf16 [[AW1:%rs[0-9]+]], [[A1]]; -; SM80-DAG: neg.bf16 [[AY1:%rs[0-9]+]], [[AW1]]; -; SM80-DAG: shr.u16 [[BS1:%rs[0-9]+]], [[B1]], 15; -; SM80-DAG: and.b16 [[BR1:%rs[0-9]+]], [[BS1]], 1; -; SM80-DAG: setp.eq.b16 [[P1:%p[0-9]+]], [[BR1]], 1; -; SM80-DAG: selp.b16 [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]] -; SM80-DAG: abs.bf16 [[AW0:%rs[0-9]+]], [[A0]]; -; SM80-DAG: neg.bf16 [[AY0:%rs[0-9]+]], [[AW0]]; -; SM80-DAG: shr.u16 [[BS0:%rs[0-9]+]], [[B0]], 15; -; SM80-DAG: and.b16 [[BR0:%rs[0-9]+]], [[BS0]], 1; -; SM80-DAG: setp.eq.b16 [[P0:%p[0-9]+]], [[BR0]], 1; -; SM80-DAG: selp.b16 [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]] -; SM80-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS0]], [[RS1]]} -; SM90-DAG: and.b32 [[R1:%r[0-9]+]], [[B]], -2147450880; -; SM90-DAG: and.b32 [[R2:%r[0-9]+]], [[A]], 2147450879; -; SM90-DAG: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { +; SM80-LABEL: test_copysign( +; SM80: { +; SM80-NEXT: .reg .pred %p<3>; +; SM80-NEXT: .reg .b16 %rs<17>; +; SM80-NEXT: .reg .b32 
%r<4>; +; SM80-EMPTY: +; SM80-NEXT: // %bb.0: +; SM80-NEXT: ld.param.b32 %r1, [test_copysign_param_1]; +; SM80-NEXT: ld.param.b32 %r2, [test_copysign_param_0]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: abs.bf16 %rs3, %rs2; +; SM80-NEXT: neg.bf16 %rs4, %rs3; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM80-NEXT: shr.u16 %rs8, %rs6, 15; +; SM80-NEXT: and.b16 %rs9, %rs8, 1; +; SM80-NEXT: setp.eq.b16 %p1, %rs9, 1; +; SM80-NEXT: selp.b16 %rs10, %rs4, %rs3, %p1; +; SM80-NEXT: abs.bf16 %rs11, %rs1; +; SM80-NEXT: neg.bf16 %rs12, %rs11; +; SM80-NEXT: shr.u16 %rs14, %rs5, 15; +; SM80-NEXT: and.b16 %rs15, %rs14, 1; +; SM80-NEXT: setp.eq.b16 %p2, %rs15, 1; +; SM80-NEXT: selp.b16 %rs16, %rs12, %rs11, %p2; +; SM80-NEXT: mov.b32 %r3, {%rs16, %rs10}; +; SM80-NEXT: st.param.b32 [func_retval0], %r3; +; SM80-NEXT: ret; +; +; SM90-LABEL: test_copysign( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; SM90-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; SM90-NEXT: and.b32 %r4, %r2, -2147450880; +; SM90-NEXT: and.b32 %r6, %r1, 2147450879; +; SM90-NEXT: or.b32 %r7, %r6, %r4; +; SM90-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %r } diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index b11c69e064c4a..eb0b00e883846 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1,325 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; ## Full FP16 support enabled by default. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s ; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ ; RUN: %} ; ## FP16 support explicitly disabled. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s ; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \ ; RUN: -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ ; RUN: %} ; ## FP16 is not supported by hardware. 
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s ; RUN: %if ptxas %{ \ -; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ +; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \ ; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_52 \ ; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; CHECK-LABEL: test_ret_const( -; CHECK: mov.b32 [[R:%r[0-9+]]], 1073757184; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 1073757184; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; ret <2 x half> } -; CHECK-LABEL: test_extract_0( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_extract_0_param_0]; -; CHECK: mov.b32 {[[R:%rs[0-9]+]], tmp}, [[A]]; -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK: ret; define half @test_extract_0(<2 x half> %a) #0 { +; CHECK-LABEL: test_extract_0( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 0 ret half %e } -; CHECK-LABEL: test_extract_1( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_extract_1_param_0]; -; CHECK: mov.b32 {tmp, [[R:%rs[0-9]+]]}, [[A]]; -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK: ret; define half @test_extract_1(<2 x half> %a) #0 { +; CHECK-LABEL: test_extract_1( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 1 ret half %e } -; CHECK-LABEL: test_extract_i( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_extract_i_param_0]; -; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]]; -; CHECK: selp.b16 [[R:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK: ret; define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { +; CHECK-LABEL: test_extract_i( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i64 %idx ret half %e } -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b32 
[[B:%r[0-9]+]], [test_fadd_param_1]; -; -; CHECK-F16-NEXT: add.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fadd( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; +; CHECK-F16-NEXT: add.rn.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fadd( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-NEXT: .reg .f32 %f<7>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %f6, %f5, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NOF16-NEXT: ret; %r = fadd <2 x half> %a, %b ret <2 x half> %r } ; Check that we can lower fadd with immediate arguments. 
-; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fadd_imm_0_param_0]; -; -; CHECK-F16: mov.b32 [[I:%r[0-9+]]], 1073757184; -; CHECK-F16: add.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[I]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { +; CHECK-F16-LABEL: test_fadd_imm_0( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; CHECK-F16-NEXT: mov.b32 %r2, 1073757184; +; CHECK-F16-NEXT: add.rn.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fadd_imm_0( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: add.rn.f32 %f2, %f1, 0f40000000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %f4, %f3, 0f3F800000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NOF16-NEXT: ret; %r = fadd <2 x half> , %a ret <2 x half> %r } -; CHECK-LABEL: test_fadd_imm_1( -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fadd_imm_1_param_0]; -; -; CHECK-F16: mov.b32 [[I:%r[0-9+]]], 1073757184; -; CHECK-F16: add.rn.f16x2 [[R:%r[0-9]+]], [[B]], [[I]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { +; CHECK-F16-LABEL: test_fadd_imm_1( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; +; CHECK-F16-NEXT: mov.b32 %r2, 1073757184; +; CHECK-F16-NEXT: add.rn.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fadd_imm_1( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; +; 
CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: add.rn.f32 %f2, %f1, 0f40000000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %f4, %f3, 0f3F800000; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NOF16-NEXT: ret; %r = fadd <2 x half> %a, ret <2 x half> %r } -; CHECK-LABEL: test_fsub( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fsub_param_0]; -; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fsub_param_1]; -; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fsub( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fsub_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; +; CHECK-F16-NEXT: sub.rn.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fsub( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-NEXT: .reg .f32 %f<7>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fsub_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: sub.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NOF16-NEXT: sub.rn.f32 %f6, %f5, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NOF16-NEXT: ret; %r = fsub <2 x half> %a, %b ret <2 x half> %r } -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fneg_param_0]; -; -; CHECK-F16: mov.b32 [[I:%r[0-9+]]], 0; -; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%r[0-9]+]], [[I]], [[A]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]]; -; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; 
CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fneg(<2 x half> %a) #0 { +; CHECK-F16-LABEL: test_fneg( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; +; CHECK-F16-NEXT: mov.b32 %r2, 0; +; CHECK-F16-NEXT: sub.rn.f16x2 %r3, %r2, %r1; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fneg( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<6>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.f32 %f2, 0f00000000; +; CHECK-NOF16-NEXT: sub.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: sub.rn.f32 %f5, %f2, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs4, %f5; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NOF16-NEXT: ret; %r = fsub <2 x half> , %a ret <2 x half> %r } -; CHECK-LABEL: test_fmul( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fmul_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fmul_param_1]; -; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fmul( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<4>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; +; CHECK-F16-NEXT: mul.rn.f16x2 %r3, %r1, %r2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fmul( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-NEXT: .reg .f32 %f<7>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: mul.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; +; 
CHECK-NOF16-NEXT: mul.rn.f32 %f6, %f5, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NOF16-NEXT: ret; %r = fmul <2 x half> %a, %b ret <2 x half> %r } -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]; -; CHECK-NEXT: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_fdiv( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NEXT: div.rn.f32 %f3, %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NEXT: div.rn.f32 %f6, %f5, %f4; +; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = fdiv <2 x half> %a, %b ret <2 x half> %r } -; CHECK-LABEL: test_frem( ; -- Load two 16x2 inputs and split them into f16 elements -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_frem_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_frem_param_1]; ; -- Split into elements -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] ; -- promote to f32. -; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]; ; -- frem(a[0],b[0]). -; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-DAG: cvt.rzi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]]; -; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]]; -; CHECK-DAG: sub.f32 [[RFNINF0:%f[0-9]+]], [[FA0]], [[RI0]]; -; CHECK-DAG: testp.infinite.f32 [[ISB0INF:%p[0-9]+]], [[FB0]]; -; CHECK-DAG: selp.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RFNINF0]], [[ISB0INF]]; ; -- frem(a[1],b[1]). -; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rzi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; -; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; -; CHECK-DAG: sub.f32 [[RFNINF1:%f[0-9]+]], [[FA1]], [[RI1]]; -; CHECK-DAG: testp.infinite.f32 [[ISB1INF:%p[0-9]+]], [[FB1]]; -; CHECK-DAG: selp.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RFNINF1]], [[ISB1INF]]; ; -- convert back to f16. 
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; ; -- merge into f16x2 and return it. -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_frem( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<15>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_frem_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_frem_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NEXT: div.rn.f32 %f3, %f2, %f1; +; CHECK-NEXT: cvt.rzi.f32.f32 %f4, %f3; +; CHECK-NEXT: mul.f32 %f5, %f4, %f1; +; CHECK-NEXT: sub.f32 %f6, %f2, %f5; +; CHECK-NEXT: testp.infinite.f32 %p1, %f1; +; CHECK-NEXT: selp.f32 %f7, %f2, %f6, %p1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f7; +; CHECK-NEXT: cvt.f32.f16 %f8, %rs1; +; CHECK-NEXT: cvt.f32.f16 %f9, %rs3; +; CHECK-NEXT: div.rn.f32 %f10, %f9, %f8; +; CHECK-NEXT: cvt.rzi.f32.f32 %f11, %f10; +; CHECK-NEXT: mul.f32 %f12, %f11, %f8; +; CHECK-NEXT: sub.f32 %f13, %f9, %f12; +; CHECK-NEXT: testp.infinite.f32 %p2, %f8; +; CHECK-NEXT: selp.f32 %f14, %f9, %f13, %p2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f14; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = frem <2 x half> %a, %b ret <2 x half> %r } -; CHECK-LABEL: .func test_ldst_v2f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; -; CHECK-DAG: ld.b32 [[E:%r[0-9]+]], [%[[A]]] -; CHECK-DAG: st.b32 [%[[B]]], [[E]]; -; CHECK: ret; define void @test_ldst_v2f16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v2f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f16_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f16_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: st.b32 [%rd2], %r1; +; CHECK-NEXT: ret; %t1 = load <2 x half>, ptr %a store <2 x half> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v3f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; ; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair ; number of bitshifting instructions that may change at llvm's whim. ; So we only verify that we only issue correct number of writes using ; correct offset, but not the values we write. 
-; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.b16 [%[[B]]+4], -; CHECK: ret; define void @test_ldst_v3f16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v3f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f16_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f16_param_0]; +; CHECK-NEXT: ld.u64 %rd3, [%rd1]; +; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd3; } +; CHECK-NEXT: st.u32 [%rd2], %rd3; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; CHECK-NEXT: st.b16 [%rd2+4], %rs1; +; CHECK-NEXT: ret; %t1 = load <3 x half>, ptr %a store <3 x half> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v4f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; -; CHECK-DAG: ld.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; define void @test_ldst_v4f16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v4f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f16_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f16_param_0]; +; CHECK-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4}; +; CHECK-NEXT: ret; %t1 = load <4 x half>, ptr %a store <4 x half> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v8f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; -; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; define void @test_ldst_v8f16(ptr %a, ptr %b) { +; CHECK-LABEL: test_ldst_v8f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f16_param_1]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f16_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; %t1 = load <8 x half>, ptr %a store <8 x half> %t1, ptr %b, align 16 ret void @@ -327,704 +461,1210 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) { declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0], [[A]]; -; CHECK-DAG: st.param.b32 [param1], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, 
[test_call_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0], [[B]]; -; CHECK-DAG: st.param.b32 [param1], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_call_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0], [[B]]; -; CHECK-DAG: st.param.b32 [param1], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_tailcall_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .align 4 .b8 param1[4]; +; CHECK-NEXT: st.param.b32 [param1], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; 
CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0]; +; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = select i1 %c, <2 x half> %a, <2 x half> %b ret <2 x half> %r } -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%r[0-9]+]], [test_select_cc_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; -; CHECK-NOF16-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { +; CHECK-F16-LABEL: test_select_cc( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<7>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4; +; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-F16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-F16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-F16-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; 
CHECK-F16-NEXT: mov.b32 %r5, {%rs6, %rs5}; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_select_cc( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; +; CHECK-NOF16-NEXT: .reg .b32 %r<6>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; CHECK-NOF16-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; CHECK-NOF16-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NOF16-NEXT: ret; %cc = fcmp une <2 x half> %c, %d %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b ret <2 x half> %r } -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%r[0-9]+]], [test_select_cc_f32_f16_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, +; CHECK-F16-LABEL: test_select_cc_f32_f16( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-NEXT: .reg .f32 %f<7>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.f32 %f5, %f2, %f4, %p2; +; CHECK-F16-NEXT: selp.f32 %f6, %f1, %f3, %p1; +; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5}; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_select_cc_f32_f16( +; CHECK-NOF16: 
{ +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<11>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %f6, %f5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f7, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f8, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f8, %f7; +; CHECK-NOF16-NEXT: selp.f32 %f9, %f2, %f4, %p2; +; CHECK-NOF16-NEXT: selp.f32 %f10, %f1, %f3, %p1; +; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9}; +; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { %cc = fcmp une <2 x half> %c, %d %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b ret <2 x float> %r } -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; +; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3; +; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { %cc = fcmp une <2 x float> %c, %d %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b ret <2 x half> %r } -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; 
CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_une( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_une( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp une <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ueq( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg 
.b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; +; CHECK-F16-NEXT: setp.equ.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ueq( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.equ.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ueq <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ugt( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; +; CHECK-F16-NEXT: setp.gtu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ugt( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; +; 
CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ugt <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_uge( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; +; CHECK-F16-NEXT: setp.geu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_uge( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.geu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp uge <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b32 
[[B:%r[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ult( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; +; CHECK-F16-NEXT: setp.ltu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ult( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ult <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; 
CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ule( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; +; CHECK-F16-NEXT: setp.leu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ule( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.leu.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ule <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_uno( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; +; CHECK-F16-NEXT: setp.nan.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_uno( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; 
CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp uno <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_one( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; +; CHECK-F16-NEXT: setp.ne.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_one( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.ne.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: 
ret; %r = fcmp one <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_oeq( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_oeq_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_oeq_param_0]; +; CHECK-F16-NEXT: setp.eq.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_oeq( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oeq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oeq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp oeq <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, 
[[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ogt( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; +; CHECK-F16-NEXT: setp.gt.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ogt( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.gt.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ogt <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_oge( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; +; CHECK-F16-NEXT: setp.ge.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; 
CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_oge( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.ge.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp oge <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_olt( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; +; CHECK-F16-NEXT: setp.lt.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_olt( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: 
st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp olt <2 x half> %a, %b ret <2 x i1> %r } -; XCHECK-LABEL: test_fcmp_ole( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ole_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ole_param_1]; -; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ole( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; +; CHECK-F16-NEXT: setp.le.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ole( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.le.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.le.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ole <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fcmp_ord( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fcmp_ord_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fcmp_ord_param_1]; -; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: 
setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-NEXT: st.param.b8 [func_retval0], [[R0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; -; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_fcmp_ord( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .pred %p<3>; +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; +; CHECK-F16-NEXT: setp.num.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-F16-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-F16-NEXT: selp.u16 %rs2, -1, 0, %p2; +; CHECK-F16-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fcmp_ord( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .pred %p<3>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: setp.num.f32 %p1, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs3; +; CHECK-NOF16-NEXT: setp.num.f32 %p2, %f4, %f3; +; CHECK-NOF16-NEXT: selp.u16 %rs5, -1, 0, %p2; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; +; CHECK-NOF16-NEXT: selp.u16 %rs6, -1, 0, %p1; +; CHECK-NOF16-NEXT: st.param.b8 [func_retval0+1], %rs6; +; CHECK-NOF16-NEXT: ret; %r = fcmp ord <2 x half> %a, %b ret <2 x i1> %r } -; CHECK-LABEL: test_fptosi_i32( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fptosi_i32_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]} -; CHECK: ret; define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { +; CHECK-LABEL: test_fptosi_i32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2; +; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; +; CHECK-NEXT: ret; %r = fptosi <2 x half> %a to <2 x i32> ret <2 x i32> %r } -; CHECK-LABEL: test_fptosi_i64( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fptosi_i64_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]} -; CHECK: ret; define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { +; CHECK-LABEL: test_fptosi_i64( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b32 %r1, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rzi.s64.f16 %rd1, %rs2; +; CHECK-NEXT: cvt.rzi.s64.f16 %rd2, %rs1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NEXT: ret; %r = fptosi <2 x half> %a to <2 x i64> ret <2 x i64> %r } -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fptoui_2xi32_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]} -; CHECK: ret; define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2; +; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; +; CHECK-NEXT: ret; %r = fptoui <2 x half> %a to <2 x i32> ret <2 x i32> %r } -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fptoui_2xi64_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]} -; CHECK: ret; define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rzi.u64.f16 %rd1, %rs2; +; CHECK-NEXT: cvt.rzi.u64.f16 %rd2, %rs1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; CHECK-NEXT: ret; %r = fptoui <2 x half> %a to <2 x i64> ret <2 x i64> %r } -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.u32 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f16.u32 %rs1, %r2; +; CHECK-NEXT: cvt.rn.f16.u32 %rs2, %r1; +; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_uitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f16.u64 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u64 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f16.u64 %rs1, %rd2; +; CHECK-NEXT: cvt.rn.f16.u64 %rs2, %rd1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = uitofp <2 x i64> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.s32 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f16.s32 %rs1, %r2; +; CHECK-NEXT: cvt.rn.f16.s32 %rs2, %r1; +; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_sitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f16.s64 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s64 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f16.s64 %rs1, %rd2; +; CHECK-NEXT: cvt.rn.f16.s64 %rs2, %rd1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = sitofp <2 x i64> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_uitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.u32 [[C0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[C1:%rs[0-9]+]], [[A1]]; -; CHECK-F16-DAG: mov.b32 [[C:%r[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%r[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_uitofp_2xi32_fadd( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; 
CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs1, %r2; +; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs2, %r1; +; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; +; CHECK-F16-NEXT: add.rn.f16x2 %r5, %r3, %r4; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_uitofp_2xi32_fadd( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .f32 %f<7>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %f6, %f5, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: ret; %c = uitofp <2 x i32> %a to <2 x half> %r = fadd <2 x half> %b, %c ret <2 x half> %r } -; CHECK-LABEL: test_sitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.s32 [[C0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[C1:%rs[0-9]+]], [[A1]]; -; -; CHECK-F16-DAG: mov.b32 [[C:%r[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%r[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_sitofp_2xi32_fadd( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<6>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; +; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs1, %r2; +; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs2, %r1; +; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; +; CHECK-F16-NEXT: add.rn.f16x2 %r5, %r3, %r4; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_sitofp_2xi32_fadd( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .f32 %f<7>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // 
%bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %f6, %f5, %f4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: ret; %c = sitofp <2 x i32> %a to <2 x half> %r = fadd <2 x half> %b, %c ret <2 x half> %r } -; CHECK-LABEL: test_fptrunc_2xfloat( -; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %f2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; -; CHECK-DAG: cvt.rn.f16.f64 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f64 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NEXT: cvt.rn.f16.f64 %rs1, %fd2; +; CHECK-NEXT: cvt.rn.f16.f64 %rs2, %fd1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = fptrunc <2 x double> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_fpext_2xfloat( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]}; -; CHECK: ret; define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; +; CHECK-NEXT: 
st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NEXT: ret; %r = fpext <2 x half> %a to <2 x float> ret <2 x float> %r } -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fpext_2xdouble_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f64 [func_retval0], {[[R0]], [[R1]]}; -; CHECK: ret; define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f64.f16 %fd1, %rs2; +; CHECK-NEXT: cvt.f64.f16 %fd2, %rs1; +; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1}; +; CHECK-NEXT: ret; %r = fpext <2 x half> %a to <2 x double> ret <2 x double> %r } -; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK: st.param.b32 [func_retval0], [[A]] -; CHECK: ret; define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { +; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = bitcast <2 x half> %a to <2 x i16> ret <2 x i16> %r } -; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( -; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { +; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = bitcast <2 x i16> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_bitcast_float_to_2xhalf( -; CHECK: ld.param.f32 [[AF1:%f[0-9]+]], [test_bitcast_float_to_2xhalf_param_0]; -; CHECK: mov.b32 [[R:%r[0-9]+]], [[AF1]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 { +; CHECK-LABEL: test_bitcast_float_to_2xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %r = bitcast float %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_bitcast_2xhalf_to_float( -; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xhalf_to_float_param_0]; -; CHECK: mov.b32 [[AF1:%f[0-9]+]], [[R]]; -; CHECK: st.param.f32 [func_retval0], [[AF1]]; -; CHECK: ret; define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 { +; CHECK-LABEL: test_bitcast_2xhalf_to_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xhalf_to_float_param_0]; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: ret; %r = 
bitcast <2 x half> %a to float ret float %r } @@ -1053,19 +1693,25 @@ declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 declare <2 x half> @llvm.roundeven.f16(<2 x half> %a) #0 declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sqrt_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_sqrt(<2 x half> %a) #0 { +; CHECK-LABEL: test_sqrt( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: sqrt.rn.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: sqrt.rn.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) ret <2 x half> %r } @@ -1077,36 +1723,48 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 { ; ret <2 x half> %r ;} -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sin_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_sin(<2 x half> %a) #0 #1 { +; CHECK-LABEL: test_sin( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: sin.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: sin.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_cos_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: 
st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_cos(<2 x half> %a) #0 #1 { +; CHECK-LABEL: test_cos( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: cos.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: cos.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) ret <2 x half> %r } @@ -1153,355 +1811,579 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 { ; ret <2 x half> %r ;} -; CHECK-LABEL: test_fma( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fma_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fma_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_fma_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} - -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret + define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { +; CHECK-F16-LABEL: test_fma( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<5>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; +; CHECK-F16-NEXT: fma.rn.f16x2 %r4, %r1, %r2, %r3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fma( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .f32 %f<9>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; +; CHECK-NOF16-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %f4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f7, %rs5; +; 
CHECK-NOF16-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %f8; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ret <2 x half> %r } -; CHECK-LABEL: test_fabs( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0]; -; CHECK-NOF16: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-NOF16-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-NOF16-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-F16: and.b32 [[R:%r[0-9]+]], [[A]], 2147450879; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_fabs(<2 x half> %a) #0 { +; CHECK-F16-LABEL: test_fabs( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<5>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; +; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fabs( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<5>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: abs.f32 %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NOF16-NEXT: abs.f32 %f4, %f3; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_minnum( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_minnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_minnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_minnum( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NEXT: min.f32 %f3, %f2, 
%f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NEXT: min.f32 %f6, %f5, %f4; +; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: test_maxnum( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .f32 %f<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r2, [test_maxnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NEXT: max.f32 %f3, %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f3; +; CHECK-NEXT: cvt.f32.f16 %f4, %rs1; +; CHECK-NEXT: cvt.f32.f16 %f5, %rs3; +; CHECK-NEXT: max.f32 %f6, %f5, %f4; +; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f6; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_param_1]; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-NOF16-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-NOF16-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; -; CHECK-NOF16-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; -; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-F16-DAG: and.b32 [[R0:%r[0-9]+]], [[B]], -2147450880; -; CHECK-F16-DAG: and.b32 [[R1:%r[0-9]+]], [[A]], 2147450879; -; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R1]], [[R0]] -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_copysign( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-F16-NEXT: and.b32 %r4, %r2, 
-2147450880; +; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_copysign( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs4, %rs2, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs6, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs9, %rs8, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs12, %rs1, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs14, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs15, %rs14, %rs12; +; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs15, %rs9}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } -; CHECK-LABEL: test_copysign_f32( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; -; CHECK-NOF16-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; -; CHECK-NOF16-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-NOF16-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-NOF16-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; -; CHECK-NOF16-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-NOF16-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; } -; CHECK-NOF16-DAG: mov.b32 {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; } -; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-F16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[B1]]; -; CHECK-F16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[B0]]; -; CHECK-F16-DAG: mov.b32 [[R2:%r[0-9]+]], {[[R1]], [[R0]]}; -; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; -; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; -; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]] -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { +; CHECK-F16-LABEL: test_copysign_f32( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .f32 %f<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %f2; +; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %f1; +; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_copysign_f32( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<13>; +; CHECK-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-NOF16-NEXT: .reg .f32 %f<3>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, 
[test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: mov.b32 %r2, %f2; +; CHECK-NOF16-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r3; } +; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs6, %rs5, %rs1; +; CHECK-NOF16-NEXT: mov.b32 %r4, %f1; +; CHECK-NOF16-NEXT: and.b32 %r5, %r4, -2147483648; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } +; CHECK-NOF16-NEXT: and.b16 %rs10, %rs2, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs11, %rs10, %rs8; +; CHECK-NOF16-NEXT: mov.b32 %r6, {%rs11, %rs6}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) ret <2 x half> %r } -; CHECK-LABEL: test_copysign_f64( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; -; CHECK-NOF16-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; -; CHECK-NOF16-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-NOF16-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-NOF16-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; -; CHECK-NOF16-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; -; CHECK-NOF16-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; -; CHECK-NOF16-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; -; CHECK-NOF16-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-NOF16-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-F16-DAG: cvt.rn.f16.f64 [[R0:%rs[0-9]+]], [[B1]]; -; CHECK-F16-DAG: cvt.rn.f16.f64 [[R1:%rs[0-9]+]], [[B0]]; -; CHECK-F16-DAG: mov.b32 [[R2:%r[0-9]+]], {[[R1]], [[R0]]}; -; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; -; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; -; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { +; CHECK-F16-LABEL: test_copysign_f64( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .f64 %fd<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; +; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs1, %fd2; +; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs2, %fd1; +; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_copysign_f64( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<13>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<7>; +; CHECK-NOF16-NEXT: .reg .f64 %fd<3>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, 
[test_copysign_f64_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs4, %rs2, 32767; +; CHECK-NOF16-NEXT: mov.b64 %rd1, %fd2; +; CHECK-NOF16-NEXT: and.b64 %rd2, %rd1, -9223372036854775808; +; CHECK-NOF16-NEXT: shr.u64 %rd3, %rd2, 48; +; CHECK-NOF16-NEXT: cvt.u16.u64 %rs5, %rd3; +; CHECK-NOF16-NEXT: or.b16 %rs6, %rs4, %rs5; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; +; CHECK-NOF16-NEXT: mov.b64 %rd4, %fd1; +; CHECK-NOF16-NEXT: and.b64 %rd5, %rd4, -9223372036854775808; +; CHECK-NOF16-NEXT: shr.u64 %rd6, %rd5, 48; +; CHECK-NOF16-NEXT: cvt.u16.u64 %rs10, %rd6; +; CHECK-NOF16-NEXT: or.b16 %rs11, %rs9, %rs10; +; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs11, %rs6}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x double> %b to <2 x half> %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) ret <2 x half> %r } -; CHECK-LABEL: test_copysign_extended( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_extended_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-NOF16-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-NOF16-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; -; CHECK-NOF16-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; -; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[R0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R1]]; -; CHECK-F16-DAG: and.b32 [[R0:%r[0-9]+]], [[B]], -2147450880; -; CHECK-F16-DAG: and.b32 [[R1:%r[0-9]+]], [[A]], 2147450879; -; CHECK-F16-DAG: or.b32 [[R2:%r[0-9]+]], [[R1]], [[R0]] -; CHECK-F16-DAG: mov.b32 {[[R3:%rs[0-9]+]], [[R4:%rs[0-9]+]]}, [[R2]] -; CHECK-F16-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[R3]] -; CHECK-F16-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R4]] -; CHECK: st.param.v2.f32 [func_retval0], {[[XR0]], [[XR1]]}; -; CHECK: ret; define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-F16-LABEL: test_copysign_extended( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .f32 %f<3>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; +; CHECK-F16-NEXT: and.b32 %r4, %r2, -2147450880; +; CHECK-F16-NEXT: and.b32 %r6, %r1, 2147450879; +; CHECK-F16-NEXT: or.b32 %r7, %r6, %r4; +; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-F16-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-F16-NEXT: cvt.f32.f16 %f2, %rs1; +; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_copysign_extended( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; +; CHECK-NOF16-NEXT: .reg .b32 %r<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<3>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs4, %rs1, -32768; +; CHECK-NOF16-NEXT: 
mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs9, %rs8, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs12, %rs2, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs14, %rs6, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs15, %rs14, %rs12; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs15; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs9; +; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) %xr = fpext <2 x half> %r to <2 x float> ret <2 x float> %xr } -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_floor_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_floor(<2 x half> %a) #0 { +; CHECK-LABEL: test_floor( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_floor_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rmi.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rmi.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_ceil_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_ceil(<2 x half> %a) #0 { +; CHECK-LABEL: test_ceil( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rpi.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rpi.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_trunc_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_trunc(<2 x half> %a) #0 { +; CHECK-LABEL: test_trunc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rzi.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rzi.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_rint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, 
[[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_rint(<2 x half> %a) #0 { +; CHECK-LABEL: test_rint( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_rint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_nearbyint( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_nearbyint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_nearbyint(<2 x half> %a) #0 { +; CHECK-LABEL: test_nearbyint( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_roundeven( -; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_roundeven_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_roundeven(<2 x half> %a) #0 { +; CHECK-LABEL: test_roundeven( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; +; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.roundeven.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_round( -; CHECK: ld.param.b32 {{.*}}, [test_round_param_0]; ; check the use of sign mask and 0.5 to implement round -; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648; -; CHECK: or.b32 {{.*}}, [[R1]], 1056964608; -; CHECK: and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648; -; CHECK: or.b32 {{.*}}, [[R2]], 1056964608; -; CHECK: st.param.b32 [func_retval0], {{.*}}; -; CHECK: ret; define <2 x half> @test_round(<2 x half> %a) #0 { +; CHECK-LABEL: test_round( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<17>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: mov.b32 %r2, %f1; +; 
CHECK-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NEXT: mov.b32 %f2, %r4; +; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; +; CHECK-NEXT: cvt.rzi.f32.f32 %f4, %f3; +; CHECK-NEXT: abs.f32 %f5, %f1; +; CHECK-NEXT: setp.gt.f32 %p1, %f5, 0f4B000000; +; CHECK-NEXT: selp.f32 %f6, %f1, %f4, %p1; +; CHECK-NEXT: cvt.rzi.f32.f32 %f7, %f1; +; CHECK-NEXT: setp.lt.f32 %p2, %f5, 0f3F000000; +; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f8; +; CHECK-NEXT: cvt.f32.f16 %f9, %rs1; +; CHECK-NEXT: mov.b32 %r5, %f9; +; CHECK-NEXT: and.b32 %r6, %r5, -2147483648; +; CHECK-NEXT: or.b32 %r7, %r6, 1056964608; +; CHECK-NEXT: mov.b32 %f10, %r7; +; CHECK-NEXT: add.rn.f32 %f11, %f9, %f10; +; CHECK-NEXT: cvt.rzi.f32.f32 %f12, %f11; +; CHECK-NEXT: abs.f32 %f13, %f9; +; CHECK-NEXT: setp.gt.f32 %p3, %f13, 0f4B000000; +; CHECK-NEXT: selp.f32 %f14, %f9, %f12, %p3; +; CHECK-NEXT: cvt.rzi.f32.f32 %f15, %f9; +; CHECK-NEXT: setp.lt.f32 %p4, %f13, 0f3F000000; +; CHECK-NEXT: selp.f32 %f16, %f15, %f14, %p4; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f16; +; CHECK-NEXT: mov.b32 %r8, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %r = call <2 x half> @llvm.round.f16(<2 x half> %a) ret <2 x half> %r } -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%r[0-9]+]], [test_fmuladd_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%r[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { +; CHECK-F16-LABEL: test_fmuladd( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<5>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; +; CHECK-F16-NEXT: fma.rn.f16x2 %r4, %r1, %r2, %r3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-F16-NEXT: ret; +; +; CHECK-NOF16-LABEL: test_fmuladd( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .f32 %f<9>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: 
cvt.f32.f16 %f1, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6; +; CHECK-NOF16-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %f4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f7, %rs5; +; CHECK-NOF16-NEXT: fma.rn.f32 %f8, %f7, %f6, %f5; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %f8; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ret <2 x half> %r } -; CHECK-LABEL: test_shufflevector( -; CHECK: mov.b32 {%rs1, %rs2}, %r1; -; CHECK: mov.b32 %r2, {%rs2, %rs1}; define <2 x half> @test_shufflevector(<2 x half> %a) #0 { +; CHECK-LABEL: test_shufflevector( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> ret <2 x half> %s } -; CHECK-LABEL: test_insertelement( -; CHECK: mov.b32 {%rs2, tmp}, %r1; -; CHECK: mov.b32 %r2, {%rs2, %rs1}; define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 { +; CHECK-LABEL: test_insertelement( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; CHECK-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %i = insertelement <2 x half> %a, half %x, i64 1 ret <2 x half> %i } -; CHECK-LABEL: test_sitofp_2xi16_to_2xhalf( -; CHECK: cvt.rn.f16.s16 -; CHECK: cvt.rn.f16.s16 -; CHECK: ret; define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi16_to_2xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; +; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = sitofp <2 x i16> %a to <2 x half> ret <2 x half> %r } -; CHECK-LABEL: test_uitofp_2xi16_to_2xhalf( -; CHECK: cvt.rn.f16.u16 -; CHECK: cvt.rn.f16.u16 -; CHECK: ret; define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi16_to_2xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; +; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = uitofp <2 x i16> %a to <2 x half> ret <2 x half> %r } diff --git 
a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 988438bebea6d..388bd314801fc 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -1,262 +1,381 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; ## Support i16x2 instructions -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s ; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_90 \ ; RUN: %} ; ## No support for i16x2 instructions -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s ; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_53 \ ; RUN: %} target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; COMMON-LABEL: test_ret_const( -; COMMON: mov.b32 [[R:%r[0-9+]]], 131073; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_ret_const() #0 { +; COMMON-LABEL: test_ret_const( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<2>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: mov.b32 %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r1; +; COMMON-NEXT: ret; ret <2 x i16> } -; COMMON-LABEL: test_extract_0( -; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_0_param_0]; -; COMMON: mov.b32 {[[RS:%rs[0-9]+]], tmp}, [[A]]; -; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define i16 @test_extract_0(<2 x i16> %a) #0 { +; COMMON-LABEL: test_extract_0( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<2>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; +; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } -; COMMON-LABEL: test_extract_1( -; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_1_param_0]; -; COMMON: mov.b32 {tmp, [[RS:%rs[0-9]+]]}, [[A]]; -; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define i16 @test_extract_1(<2 x i16> %a) #0 { +; COMMON-LABEL: test_extract_1( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<2>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; +; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x 
i16> %a, i32 1 ret i16 %e } -; COMMON-LABEL: test_extract_i( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_extract_i_param_0]; -; COMMON-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; COMMON-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; COMMON-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]]; -; COMMON: selp.b16 [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { +; COMMON-LABEL: test_extract_i( +; COMMON: { +; COMMON-NEXT: .reg .pred %p<2>; +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-NEXT: .reg .b64 %rd<2>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_extract_i_param_0]; +; COMMON-NEXT: setp.eq.s64 %p1, %rd1, 0; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs3; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i64 %idx ret i16 %e } -; COMMON-LABEL: test_add( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_param_1]; -; -; I16x2-NEXT: add.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; NO-I16x2-DAG: add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; NO-I16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { +; I16x2-LABEL: test_add( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1]; +; I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_add( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<7>; +; NO-I16x2-NEXT: .reg .b32 %r<4>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1]; +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: add.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: add.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, %b ret <2 x i16> %r } ; Check that we can lower add with immediate arguments. 
-; COMMON-LABEL: test_add_imm_0( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_imm_0_param_0]; -; -; I16x2: mov.b32 [[I:%r[0-9+]]], 131073; -; I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; -; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { +; I16x2-LABEL: test_add_imm_0( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; I16x2-NEXT: mov.b32 %r2, 131073; +; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_add_imm_0( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<5>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; +; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; +; NO-I16x2-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %r = add <2 x i16> , %a ret <2 x i16> %r } -; COMMON-LABEL: test_add_imm_1( -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_imm_1_param_0]; -; -; I16x2: mov.b32 [[I:%r[0-9+]]], 131073; -; I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; -; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { +; I16x2-LABEL: test_add_imm_1( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; I16x2-NEXT: mov.b32 %r2, 131073; +; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_add_imm_1( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<5>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; +; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; +; NO-I16x2-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, ret <2 x i16> %r } -; COMMON-LABEL: test_sub( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sub_param_0]; -; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_sub_param_1]; -; -; COMMON-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; COMMON-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; COMMON-DAG: sub.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; COMMON-DAG: sub.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; COMMON-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_sub( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<7>; +; COMMON-NEXT: .reg .b32 
%r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_sub_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_sub_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: sub.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = sub <2 x i16> %a, %b ret <2 x i16> %r } -; COMMON-LABEL: test_smax( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smax_param_0]; -; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smax_param_1]; -; I16x2: max.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; NO-I16x2-DAG: max.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; NO-I16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { +; I16x2-LABEL: test_smax( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1]; +; I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0]; +; I16x2-NEXT: max.s16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_smax( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<7>; +; NO-I16x2-NEXT: .reg .b32 %r<4>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1]; +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; NO-I16x2-NEXT: ret; %cmp = icmp sgt <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_umax( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umax_param_0]; -; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umax_param_1]; -; I16x2: max.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; NO-I16x2-DAG: max.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; NO-I16x2-DAG: max.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { +; I16x2-LABEL: test_umax( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1]; +; I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; I16x2-NEXT: max.u16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_umax( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<7>; +; NO-I16x2-NEXT: .reg .b32 %r<4>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1]; +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, 
%r1; +; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; NO-I16x2-NEXT: ret; %cmp = icmp ugt <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_smin( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smin_param_0]; -; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smin_param_1]; -; I16x2: min.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; NO-I16x2-DAG: min.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; NO-I16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { +; I16x2-LABEL: test_smin( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1]; +; I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; I16x2-NEXT: min.s16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_smin( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<7>; +; NO-I16x2-NEXT: .reg .b32 %r<4>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1]; +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; NO-I16x2-NEXT: ret; %cmp = icmp sle <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_umin( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umin_param_0]; -; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umin_param_1]; -; I16x2: min.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; -; -; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; NO-I16x2-DAG: min.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; NO-I16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { +; I16x2-LABEL: test_umin( +; I16x2: { +; I16x2-NEXT: .reg .b32 %r<4>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1]; +; I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; I16x2-NEXT: min.u16x2 %r3, %r1, %r2; +; I16x2-NEXT: st.param.b32 [func_retval0], %r3; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_umin( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<7>; +; NO-I16x2-NEXT: .reg .b32 %r<4>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1]; +; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; NO-I16x2-NEXT: st.param.b32 
[func_retval0], %r3; +; NO-I16x2-NEXT: ret; %cmp = icmp ule <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_mul( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_mul_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_mul_param_1]; -; -; COMMON-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; COMMON-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; COMMON-DAG: mul.lo.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; COMMON-DAG: mul.lo.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; COMMON-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; -; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_mul( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<7>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_mul_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_mul_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: mul.lo.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = mul <2 x i16> %a, %b ret <2 x i16> %r } ;; Logical ops are available on all GPUs as regular 32-bit logical ops -; COMMON-LABEL: test_or( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_or_param_1]; -; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_or( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r3, [test_or_param_1]; +; COMMON-NEXT: ld.param.u32 %r4, [test_or_param_0]; +; COMMON-NEXT: or.b32 %r5, %r4, %r3; +; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ret; %r = or <2 x i16> %a, %b ret <2 x i16> %r } ; Ops that operate on computed arguments go though a different lowering path. ; compared to the ones that operate on loaded data. So we test them separately. -; COMMON-LABEL: test_or_computed( -; COMMON: ld.param.u16 [[A:%rs[0-9+]]], [test_or_computed_param_0]; -; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0; -; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]}; -; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; -; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; -; COMMON: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_or_computed(i16 %a) { +; COMMON-LABEL: test_or_computed( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u16 %rs1, [test_or_computed_param_0]; +; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; +; COMMON-NEXT: or.b32 %r3, %r2, %r1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 %r = or <2 x i16> %ins.1, %ins.0 @@ -264,46 +383,64 @@ define <2 x i16> @test_or_computed(i16 %a) { } ; Check that we can lower or with immediate arguments. 
-; COMMON-LABEL: test_or_imm_0( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_or_imm_0_param_0]; -; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 { +; COMMON-LABEL: test_or_imm_0( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0]; +; COMMON-NEXT: or.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = or <2 x i16> , %a ret <2 x i16> %r } -; COMMON-LABEL: test_or_imm_1( -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_or_imm_1_param_0]; -; COMMON-NEXT: or.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 { +; COMMON-LABEL: test_or_imm_1( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0]; +; COMMON-NEXT: or.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = or <2 x i16> %a, ret <2 x i16> %r } -; COMMON-LABEL: test_xor( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_xor_param_1]; -; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_xor( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r3, [test_xor_param_1]; +; COMMON-NEXT: ld.param.u32 %r4, [test_xor_param_0]; +; COMMON-NEXT: xor.b32 %r5, %r4, %r3; +; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ret; %r = xor <2 x i16> %a, %b ret <2 x i16> %r } -; COMMON-LABEL: test_xor_computed( -; COMMON: ld.param.u16 [[A:%rs[0-9+]]], [test_xor_computed_param_0]; -; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0; -; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]}; -; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; -; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; -; COMMON: xor.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_xor_computed(i16 %a) { +; COMMON-LABEL: test_xor_computed( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u16 %rs1, [test_xor_computed_param_0]; +; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; +; COMMON-NEXT: xor.b32 %r3, %r2, %r1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 %r = xor <2 x i16> %ins.1, %ins.0 @@ -311,48 +448,66 @@ define <2 x i16> @test_xor_computed(i16 %a) { } ; Check that we can lower xor with immediate arguments. 
-; COMMON-LABEL: test_xor_imm_0( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_xor_imm_0_param_0]; -; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 { +; COMMON-LABEL: test_xor_imm_0( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0]; +; COMMON-NEXT: xor.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = xor <2 x i16> , %a ret <2 x i16> %r } -; COMMON-LABEL: test_xor_imm_1( -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_xor_imm_1_param_0]; -; COMMON-NEXT: xor.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 { +; COMMON-LABEL: test_xor_imm_1( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0]; +; COMMON-NEXT: xor.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = xor <2 x i16> %a, ret <2 x i16> %r } -; COMMON-LABEL: test_and( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_and_param_1]; -; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], [[B]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_and( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<7>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r3, [test_and_param_1]; +; COMMON-NEXT: ld.param.u32 %r4, [test_and_param_0]; +; COMMON-NEXT: and.b32 %r5, %r4, %r3; +; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ret; %r = and <2 x i16> %a, %b ret <2 x i16> %r } ; Ops that operate on computed arguments go though a different lowering path. ; compared to the ones that operate on loaded data. So we test them separately. -; COMMON-LABEL: test_and_computed( -; COMMON: ld.param.u16 [[A:%rs[0-9+]]], [test_and_computed_param_0]; -; COMMON-DAG: mov.u16 [[C0:%rs[0-9]+]], 0; -; COMMON-DAG: mov.b32 [[R1:%r[0-9]+]], {[[A]], [[C0]]}; -; COMMON-DAG: mov.u16 [[C5:%rs[0-9]+]], 5; -; COMMON-DAG: mov.b32 [[R2:%r[0-9]+]], {[[A]], [[C5]]}; -; COMMON: and.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; define <2 x i16> @test_and_computed(i16 %a) { +; COMMON-LABEL: test_and_computed( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u16 %rs1, [test_and_computed_param_0]; +; COMMON-NEXT: mov.u16 %rs2, 0; +; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; COMMON-NEXT: mov.u16 %rs3, 5; +; COMMON-NEXT: mov.b32 %r2, {%rs1, %rs3}; +; COMMON-NEXT: and.b32 %r3, %r2, %r1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 %r = and <2 x i16> %ins.1, %ins.0 @@ -360,74 +515,102 @@ define <2 x i16> @test_and_computed(i16 %a) { } ; Check that we can lower and with immediate arguments. 
-; COMMON-LABEL: test_and_imm_0( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_and_imm_0_param_0]; -; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 { +; COMMON-LABEL: test_and_imm_0( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0]; +; COMMON-NEXT: and.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = and <2 x i16> , %a ret <2 x i16> %r } -; COMMON-LABEL: test_and_imm_1( -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_and_imm_1_param_0]; -; COMMON-NEXT: and.b32 [[R:%r[0-9]+]], [[A]], 131073; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 { +; COMMON-LABEL: test_and_imm_1( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0]; +; COMMON-NEXT: and.b32 %r2, %r1, 131073; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = and <2 x i16> %a, ret <2 x i16> %r } -; COMMON-LABEL: .func test_ldst_v2i16( -; COMMON-DAG: ld.param.u64 [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0]; -; COMMON-DAG: ld.param.u64 [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1]; -; COMMON-DAG: ld.u32 [[E:%r[0-9]+]], [[[A]]]; -; COMMON-DAG: st.u32 [[[B]]], [[E]]; -; COMMON: ret; define void @test_ldst_v2i16(ptr %a, ptr %b) { +; COMMON-LABEL: test_ldst_v2i16( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<2>; +; COMMON-NEXT: .reg .b64 %rd<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v2i16_param_1]; +; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v2i16_param_0]; +; COMMON-NEXT: ld.u32 %r1, [%rd1]; +; COMMON-NEXT: st.u32 [%rd2], %r1; +; COMMON-NEXT: ret; %t1 = load <2 x i16>, ptr %a store <2 x i16> %t1, ptr %b, align 16 ret void } -; COMMON-LABEL: .func test_ldst_v3i16( -; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3i16_param_0]; -; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3i16_param_1]; ; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair ; number of bitshifting instructions that may change at llvm's whim. ; So we only verify that we only issue correct number of writes using ; correct offset, but not the values we write. 
-; COMMON-DAG: ld.u64 -; COMMON-DAG: st.u32 [%[[B]]], -; COMMON-DAG: st.u16 [%[[B]]+4], -; COMMON: ret; define void @test_ldst_v3i16(ptr %a, ptr %b) { +; COMMON-LABEL: test_ldst_v3i16( +; COMMON: { +; COMMON-NEXT: .reg .b64 %rd<5>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v3i16_param_1]; +; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v3i16_param_0]; +; COMMON-NEXT: ld.u64 %rd3, [%rd1]; +; COMMON-NEXT: shr.u64 %rd4, %rd3, 32; +; COMMON-NEXT: st.u32 [%rd2], %rd3; +; COMMON-NEXT: st.u16 [%rd2+4], %rd4; +; COMMON-NEXT: ret; %t1 = load <3 x i16>, ptr %a store <3 x i16> %t1, ptr %b, align 16 ret void } -; COMMON-LABEL: .func test_ldst_v4i16( -; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4i16_param_0]; -; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4i16_param_1]; -; COMMON-DAG: ld.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]]; -; COMMON-DAG: st.v4.u16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; COMMON: ret; define void @test_ldst_v4i16(ptr %a, ptr %b) { +; COMMON-LABEL: test_ldst_v4i16( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<5>; +; COMMON-NEXT: .reg .b64 %rd<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v4i16_param_1]; +; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v4i16_param_0]; +; COMMON-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; COMMON-NEXT: st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4}; +; COMMON-NEXT: ret; %t1 = load <4 x i16>, ptr %a store <4 x i16> %t1, ptr %b, align 16 ret void } -; COMMON-LABEL: .func test_ldst_v8i16( -; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8i16_param_0]; -; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8i16_param_1]; -; COMMON-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; COMMON-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; COMMON: ret; define void @test_ldst_v8i16(ptr %a, ptr %b) { +; COMMON-LABEL: test_ldst_v8i16( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b64 %rd<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v8i16_param_1]; +; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v8i16_param_0]; +; COMMON-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; COMMON-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; COMMON-NEXT: ret; %t1 = load <8 x i16>, ptr %a store <8 x i16> %t1, ptr %b, align 16 ret void @@ -435,139 +618,185 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) { declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0 -; COMMON-LABEL: test_call( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_param_1]; -; COMMON: { -; COMMON-DAG: .param .align 4 .b8 param0[4]; -; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0], [[A]]; -; COMMON-DAG: st.param.b32 [param1], [[B]]; -; COMMON-DAG: .param .align 4 .b8 retval0[4]; -; COMMON: call.uni (retval0), -; COMMON-NEXT: test_callee, -; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; COMMON-NEXT: } -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_call( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_call_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_call_param_0]; +; COMMON-NEXT: { // 
callseq 0, 0 +; COMMON-NEXT: .param .align 4 .b8 param0[4]; +; COMMON-NEXT: st.param.b32 [param0], %r1; +; COMMON-NEXT: .param .align 4 .b8 param1[4]; +; COMMON-NEXT: st.param.b32 [param1], %r2; +; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: call.uni (retval0), +; COMMON-NEXT: test_callee, +; COMMON-NEXT: ( +; COMMON-NEXT: param0, +; COMMON-NEXT: param1 +; COMMON-NEXT: ); +; COMMON-NEXT: ld.param.b32 %r3, [retval0]; +; COMMON-NEXT: } // callseq 0 +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) ret <2 x i16> %r } -; COMMON-LABEL: test_call_flipped( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_flipped_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_flipped_param_1]; -; COMMON: { -; COMMON-DAG: .param .align 4 .b8 param0[4]; -; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0], [[B]]; -; COMMON-DAG: st.param.b32 [param1], [[A]]; -; COMMON-DAG: .param .align 4 .b8 retval0[4]; -; COMMON: call.uni (retval0), -; COMMON-NEXT: test_callee, -; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; COMMON-NEXT: } -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_call_flipped( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; +; COMMON-NEXT: { // callseq 1, 0 +; COMMON-NEXT: .param .align 4 .b8 param0[4]; +; COMMON-NEXT: st.param.b32 [param0], %r2; +; COMMON-NEXT: .param .align 4 .b8 param1[4]; +; COMMON-NEXT: st.param.b32 [param1], %r1; +; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: call.uni (retval0), +; COMMON-NEXT: test_callee, +; COMMON-NEXT: ( +; COMMON-NEXT: param0, +; COMMON-NEXT: param1 +; COMMON-NEXT: ); +; COMMON-NEXT: ld.param.b32 %r3, [retval0]; +; COMMON-NEXT: } // callseq 1 +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) ret <2 x i16> %r } -; COMMON-LABEL: test_tailcall_flipped( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_tailcall_flipped_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_tailcall_flipped_param_1]; -; COMMON: { -; COMMON-DAG: .param .align 4 .b8 param0[4]; -; COMMON-DAG: .param .align 4 .b8 param1[4]; -; COMMON-DAG: st.param.b32 [param0], [[B]]; -; COMMON-DAG: st.param.b32 [param1], [[A]]; -; COMMON-DAG: .param .align 4 .b8 retval0[4]; -; COMMON: call.uni (retval0), -; COMMON-NEXT: test_callee, -; COMMON: ); -; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; COMMON-NEXT: } -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { +; COMMON-LABEL: test_tailcall_flipped( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; +; COMMON-NEXT: { // callseq 2, 0 +; COMMON-NEXT: .param .align 4 .b8 param0[4]; +; COMMON-NEXT: st.param.b32 [param0], %r2; +; COMMON-NEXT: .param .align 4 .b8 param1[4]; +; COMMON-NEXT: st.param.b32 [param1], %r1; +; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: call.uni (retval0), +; COMMON-NEXT: test_callee, +; 
COMMON-NEXT: ( +; COMMON-NEXT: param0, +; COMMON-NEXT: param1 +; COMMON-NEXT: ); +; COMMON-NEXT: ld.param.b32 %r3, [retval0]; +; COMMON-NEXT: } // callseq 2 +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) ret <2 x i16> %r } -; COMMON-LABEL: test_select( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_param_1]; -; COMMON-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; COMMON-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; COMMON-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { +; COMMON-LABEL: test_select( +; COMMON: { +; COMMON-NEXT: .reg .pred %p<2>; +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; COMMON-NEXT: and.b16 %rs2, %rs1, 1; +; COMMON-NEXT: setp.eq.b16 %p1, %rs2, 1; +; COMMON-NEXT: ld.param.u32 %r2, [test_select_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_select_param_0]; +; COMMON-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = select i1 %c, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_select_cc( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_param_1]; -; COMMON-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_param_2]; -; COMMON-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_param_3]; -; COMMON-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; COMMON-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; COMMON-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; COMMON-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; COMMON-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; COMMON-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 { +; COMMON-LABEL: test_select_cc( +; COMMON: { +; COMMON-NEXT: .reg .pred %p<3>; +; COMMON-NEXT: .reg .b16 %rs<11>; +; COMMON-NEXT: .reg .b32 %r<6>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; +; COMMON-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.s16 %p2, %rs4, %rs2; +; COMMON-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; COMMON-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; COMMON-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; COMMON-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; COMMON-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r5; +; COMMON-NEXT: ret; %cc = icmp ne <2 x i16> %c, %d %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; COMMON-LABEL: test_select_cc_i32_i16( -; COMMON-DAG: 
ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_0]; -; COMMON-DAG: ld.param.v2.u32 {[[B0:%r[0-9]+]], [[B1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_1]; -; COMMON-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_i32_i16_param_2]; -; COMMON-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_i32_i16_param_3]; -; COMMON-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; COMMON-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; COMMON-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; COMMON-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; COMMON-DAG: selp.b32 [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]]; -; COMMON-DAG: selp.b32 [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]]; -; COMMON-NEXT: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; -; COMMON-NEXT: ret; define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, +; COMMON-LABEL: test_select_cc_i32_i16( +; COMMON: { +; COMMON-NEXT: .reg .pred %p<3>; +; COMMON-NEXT: .reg .b16 %rs<5>; +; COMMON-NEXT: .reg .b32 %r<9>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; +; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; +; COMMON-NEXT: ld.param.u32 %r6, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: ld.param.u32 %r5, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.s16 %p2, %rs4, %rs2; +; COMMON-NEXT: selp.b32 %r7, %r2, %r4, %p2; +; COMMON-NEXT: selp.b32 %r8, %r1, %r3, %p1; +; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; COMMON-NEXT: ret; <2 x i16> %c, <2 x i16> %d) #0 { %cc = icmp ne <2 x i16> %c, %d %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b ret <2 x i32> %r } -; COMMON-LABEL: test_select_cc_i16_i32( -; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_i16_i32_param_0]; -; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_i16_i32_param_1]; -; COMMON-DAG: ld.param.v2.u32 {[[C0:%r[0-9]+]], [[C1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_2]; -; COMMON-DAG: ld.param.v2.u32 {[[D0:%r[0-9]+]], [[D1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_3]; -; COMMON-DAG: setp.ne.s32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; COMMON-DAG: setp.ne.s32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; COMMON-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; COMMON-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON-NEXT: st.param.b32 [func_retval0], [[R]]; -; COMMON-NEXT: ret; define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, +; COMMON-LABEL: test_select_cc_i16_i32( +; COMMON: { +; COMMON-NEXT: .reg .pred %p<3>; +; COMMON-NEXT: .reg .b16 %rs<7>; +; COMMON-NEXT: .reg .b32 %r<8>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; +; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; +; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: setp.ne.s32 %p1, %r3, %r5; +; COMMON-NEXT: setp.ne.s32 %p2, %r4, %r6; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; 
COMMON-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; COMMON-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r7; +; COMMON-NEXT: ret; <2 x i32> %c, <2 x i32> %d) #0 { %cc = icmp ne <2 x i32> %c, %d %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b @@ -575,79 +804,114 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, } -; COMMON-LABEL: test_trunc_2xi32( -; COMMON: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_trunc_2xi32_param_0]; -; COMMON-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A0]]; -; COMMON-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[A1]]; -; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { +; COMMON-LABEL: test_trunc_2xi32( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0]; +; COMMON-NEXT: cvt.u16.u32 %rs1, %r2; +; COMMON-NEXT: cvt.u16.u32 %rs2, %r1; +; COMMON-NEXT: mov.b32 %r3, {%rs2, %rs1}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r3; +; COMMON-NEXT: ret; %r = trunc <2 x i32> %a to <2 x i16> ret <2 x i16> %r } -; COMMON-LABEL: test_trunc_2xi64( -; COMMON: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_trunc_2xi64_param_0]; -; COMMON-DAG: cvt.u16.u64 [[R0:%rs[0-9]+]], [[A0]]; -; COMMON-DAG: cvt.u16.u64 [[R1:%rs[0-9]+]], [[A1]]; -; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { +; COMMON-LABEL: test_trunc_2xi64( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<2>; +; COMMON-NEXT: .reg .b64 %rd<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; COMMON-NEXT: cvt.u16.u64 %rs1, %rd2; +; COMMON-NEXT: cvt.u16.u64 %rs2, %rd1; +; COMMON-NEXT: mov.b32 %r1, {%rs2, %rs1}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r1; +; COMMON-NEXT: ret; %r = trunc <2 x i64> %a to <2 x i16> ret <2 x i16> %r } -; COMMON-LABEL: test_zext_2xi32( -; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi32_param_0]; -; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; COMMON-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[A0]]; -; COMMON-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[A1]]; -; COMMON-NEXT: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; -; COMMON: ret; define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { +; COMMON-LABEL: test_zext_2xi32( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<4>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; +; COMMON-NEXT: cvt.u32.u16 %r3, %rs2; +; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3}; +; COMMON-NEXT: ret; %r = zext <2 x i16> %a to <2 x i32> ret <2 x i32> %r } -; COMMON-LABEL: test_zext_2xi64( -; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi64_param_0]; -; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; COMMON-DAG: cvt.u64.u16 [[R0:%rd[0-9]+]], [[A0]]; -; COMMON-DAG: cvt.u64.u16 [[R1:%rd[0-9]+]], [[A1]]; -; COMMON-NEXT: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}; -; COMMON: ret; define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { +; COMMON-LABEL: test_zext_2xi64( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg 
.b32 %r<2>; +; COMMON-NEXT: .reg .b64 %rd<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; +; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; +; COMMON-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; COMMON-NEXT: ret; %r = zext <2 x i16> %a to <2 x i64> ret <2 x i64> %r } -; COMMON-LABEL: test_bitcast_i32_to_2xi16( -; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0]; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { +; COMMON-LABEL: test_bitcast_i32_to_2xi16( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0]; +; COMMON-NEXT: st.param.b32 [func_retval0], %r1; +; COMMON-NEXT: ret; %r = bitcast i32 %a to <2 x i16> ret <2 x i16> %r } -; COMMON-LABEL: test_bitcast_2xi16_to_i32( -; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0]; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { +; COMMON-LABEL: test_bitcast_2xi16_to_i32( +; COMMON: { +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r2, [test_bitcast_2xi16_to_i32_param_0]; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = bitcast <2 x i16> %a to i32 ret i32 %r } -; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf( -; COMMON: ld.param.u16 [[RS1:%rs[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0]; -; COMMON: mov.u16 [[RS2:%rs[0-9]+]], 5; -; COMMON: mov.b32 [[R:%r[0-9]+]], {[[RS1]], [[RS2]]}; -; COMMON: st.param.b32 [func_retval0], [[R]]; -; COMMON: ret; define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { +; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0]; +; COMMON-NEXT: mov.u16 %rs2, 5; +; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r1; +; COMMON-NEXT: ret; %ins.0 = insertelement <2 x i16> undef, i16 %a, i32 0 %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1 %r = bitcast <2 x i16> %ins.1 to <2 x half> @@ -655,43 +919,71 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { } -; COMMON-LABEL: test_shufflevector( -; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_shufflevector_param_0]; -; COMMON: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]]; -; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[RS1]], [[RS0]]}; -; COMMON: st.param.b32 [func_retval0], [[R1]]; -; COMMON: ret; define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { +; COMMON-LABEL: test_shufflevector( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> ret <2 x i16> %s } -; COMMON-LABEL: test_insertelement( -; COMMON: ld.param.u16 [[B:%rs[0-9]+]], [test_insertelement_param_1]; -; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_insertelement_param_0]; -; COMMON: { .reg .b16 tmp; mov.b32 
{[[R0:%rs[0-9]+]], tmp}, [[A]]; } -; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[R0]], [[B]]}; -; COMMON: st.param.b32 [func_retval0], [[R1]]; -; COMMON: ret; define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { +; COMMON-LABEL: test_insertelement( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.u16 %rs1, [test_insertelement_param_1]; +; COMMON-NEXT: ld.param.u32 %r1, [test_insertelement_param_0]; +; COMMON-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %i = insertelement <2 x i16> %a, i16 %x, i64 1 ret <2 x i16> %i } -; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16( -; COMMON: cvt.rzi.s16.f16 -; COMMON: cvt.rzi.s16.f16 -; COMMON: ret; define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 { +; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<5>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; +; COMMON-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; +; COMMON-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = fptosi <2 x half> %a to <2 x i16> ret <2 x i16> %r } -; COMMON-LABEL: test_fptoui_2xhalf_to_2xi16( -; COMMON: cvt.rzi.u16.f16 -; COMMON: cvt.rzi.u16.f16 -; COMMON: ret; define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 { +; COMMON-LABEL: test_fptoui_2xhalf_to_2xi16( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<5>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; +; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; +; COMMON-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %r = fptoui <2 x half> %a to <2 x i16> ret <2 x i16> %r } diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index df9c3e59b0e6b..e9662dd8a7fa3 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ ; RUN: | FileCheck %s ; RUN: %if ptxas %{ \ @@ -9,25 +10,37 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; CHECK-LABEL: test_bitcast_2xi8_i16( -; CHECK: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0]; -; CHECK: mov.b32 {%rs1, %rs2}, %r1; -; CHECK: shl.b16 %rs3, %rs2, 8; -; CHECK: and.b16 %rs4, %rs1, 255; -; CHECK: or.b16 %rs5, %rs4, %rs3; -; CHECK: cvt.u32.u16 %r2, %rs5; -; CHECK: st.param.b32 [func_retval0], %r2; define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) { +; CHECK-LABEL: test_bitcast_2xi8_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 
+; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; +; CHECK-NEXT: and.b16 %rs4, %rs1, 255; +; CHECK-NEXT: or.b16 %rs5, %rs4, %rs3; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %res = bitcast <2 x i8> %a to i16 ret i16 %res } -; CHECK-LABEL: test_bitcast_i16_2xi8( -; CHECK: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0]; -; CHECK: shr.u16 %rs2, %rs1, 8; -; CHECK: mov.b32 %r1, {%rs1, %rs2}; -; CHECK: st.param.b32 [func_retval0], %r1; define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { +; CHECK-LABEL: test_bitcast_i16_2xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0]; +; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %res = bitcast i16 %a to <2 x i8> ret <2 x i8> %res } From a6fc489bb7a2e9fb3a7f70cccc181e4ee70374bf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 10:41:14 -0800 Subject: [PATCH 017/366] AMDGPU: Add gfx950 subtarget definitions (#116307) Mostly a stub, but adds some baseline tests and tests for removed instructions. --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Basic/Cuda.h | 1 + clang/lib/Basic/Cuda.cpp | 1 + clang/lib/Basic/Targets/NVPTX.cpp | 1 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 + clang/test/Driver/amdgpu-macros.cl | 1 + clang/test/Driver/amdgpu-mcpu.cl | 2 + .../Misc/target-invalid-cpu-note/amdgcn.c | 1 + .../test/Misc/target-invalid-cpu-note/nvptx.c | 1 + llvm/docs/AMDGPUUsage.rst | 9 +- llvm/include/llvm/BinaryFormat/ELF.h | 2 +- llvm/include/llvm/TargetParser/TargetParser.h | 25 +- llvm/lib/Object/ELFObjectFile.cpp | 2 + llvm/lib/ObjectYAML/ELFYAML.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPU.td | 16 + llvm/lib/Target/AMDGPU/GCNProcessors.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 2 + llvm/lib/TargetParser/TargetParser.cpp | 11 +- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 345 ++-- .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 6 + .../CodeGen/AMDGPU/elf-header-flags-mach.ll | 2 + .../AMDGPU/elf-header-flags-sramecc.ll | 8 + llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 594 +++++-- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 594 +++++-- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 2 + llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 1224 ++++++------- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 1113 ++++++------ llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 1569 ++++++++--------- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 1223 ++++++------- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 1113 ++++++------ llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 1569 ++++++++--------- llvm/test/MC/AMDGPU/flat-scratch-gfx940.s | 1 + llvm/test/MC/AMDGPU/gfx940_asm_features.s | 1 + llvm/test/MC/AMDGPU/gfx950-unsupported.s | 179 ++ .../MC/AMDGPU/gfx950_invalid_encoding.txt | 13 + .../Disassembler/AMDGPU/gfx940_features.txt | 1 + .../Object/AMDGPU/elf-header-flags-mach.yaml | 7 + .../llvm-objdump/ELF/AMDGPU/subtarget.ll | 6 +- .../llvm-readobj/ELF/AMDGPU/elf-headers.test | 9 + llvm/tools/llvm-readobj/ELFDumper.cpp | 1 + offload/DeviceRTL/CMakeLists.txt | 2 +- 43 files changed, 5148 insertions(+), 4520 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx950-unsupported.s create mode 100644 llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt diff --git 
a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2bd67138ecc04..0efe62f1804cd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -712,6 +712,8 @@ Target Specific Changes AMDGPU Support ^^^^^^^^^^^^^^ +- Initial support for gfx950 + - Added headers ``gpuintrin.h`` and ``amdgpuintrin.h`` that contains common definitions for GPU builtin functions. This header can be included for OpenMP, CUDA, HIP, OpenCL, and C/C++. diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 721e8981af6ff..c2a4addf488df 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -107,6 +107,7 @@ enum class OffloadArch { GFX940, GFX941, GFX942, + GFX950, GFX10_1_GENERIC, GFX1010, GFX1011, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 59c932468cd89..d56609a2a8f24 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -125,6 +125,7 @@ static const OffloadArchToStringMap arch_names[] = { GFX(940), // gfx940 GFX(941), // gfx941 GFX(942), // gfx942 + GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, GFX(1010), // gfx1010 GFX(1011), // gfx1011 diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 0897032c4b854..dbc3fec365761 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -209,6 +209,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: + case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: case OffloadArch::GFX1010: case OffloadArch::GFX1011: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 73e3f9e256f0d..756f0482b8ea7 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2304,6 +2304,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: + case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: case OffloadArch::GFX1010: case OffloadArch::GFX1011: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8b56ec94f2c4e..5c324032b5195 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -32,6 +32,7 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s @@ -88,6 +89,7 @@ // GFX941: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index d354f933c5ad7..d97b2ddb1fc66 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -110,6 +110,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9 +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck 
--check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011 -DFAMILY=GFX10 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1012 -DFAMILY=GFX10 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index ba57843507298..7c34d3ec6c63a 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -95,6 +95,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s // RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s +// RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefix=GFX1012 %s @@ -150,6 +151,7 @@ // GFX940: "-target-cpu" "gfx940" // GFX941: "-target-cpu" "gfx941" // GFX942: "-target-cpu" "gfx942" +// GFX950: "-target-cpu" "gfx950" // GFX1010: "-target-cpu" "gfx1010" // GFX1011: "-target-cpu" "gfx1011" // GFX1012: "-target-cpu" "gfx1012" diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index 4e675871f1e5b..642d2df211c21 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -48,6 +48,7 @@ // CHECK-SAME: {{^}}, gfx940 // CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 +// CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx1010 // CHECK-SAME: {{^}}, gfx1011 // CHECK-SAME: {{^}}, gfx1012 diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index 44fe07065b242..3ea6c02d6b384 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -54,6 +54,7 @@ // CHECK-SAME: {{^}}, gfx940 // CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 +// CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx10-1-generic // CHECK-SAME: {{^}}, gfx1010 // CHECK-SAME: {{^}}, gfx1011 diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index c180ca5fcebef..b85b680b9c82d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -399,6 +399,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item IDs + ``gfx950`` ``amdgcn`` dGPU - sramecc - Architected *TBA* + - tgsplit flat + - xnack scratch .. TODO:: + - kernarg preload - Packed + work-item Add product + IDs names. + **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx1010`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 5700 @@ -2178,7 +2185,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942`` *reserved* 0x04d Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201`` - *reserved* 0x04f Reserved. + ``EF_AMDGPU_MACH_AMDGCN_GFX950`` 0x04f ``gfx950`` *reserved* 0x050 Reserved. 
``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC`` 0x051 ``gfx9-generic`` ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC`` 0x052 ``gfx10-1-generic`` diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 6c05ea7208e1f..fd32a6ec19652 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -811,7 +811,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f, + EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index c6db4dfd7f515..55e7b417428c4 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -86,18 +86,19 @@ enum GPUKind : uint32_t { GK_GFX940 = 68, GK_GFX941 = 69, GK_GFX942 = 70, - - GK_GFX1010 = 71, - GK_GFX1011 = 72, - GK_GFX1012 = 73, - GK_GFX1013 = 74, - GK_GFX1030 = 75, - GK_GFX1031 = 76, - GK_GFX1032 = 77, - GK_GFX1033 = 78, - GK_GFX1034 = 79, - GK_GFX1035 = 80, - GK_GFX1036 = 81, + GK_GFX950 = 71, + + GK_GFX1010 = 72, + GK_GFX1011 = 73, + GK_GFX1012 = 74, + GK_GFX1013 = 75, + GK_GFX1030 = 76, + GK_GFX1031 = 77, + GK_GFX1032 = 78, + GK_GFX1033 = 79, + GK_GFX1034 = 80, + GK_GFX1035 = 81, + GK_GFX1036 = 82, GK_GFX1100 = 90, GK_GFX1101 = 91, diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 9dc39936ffd8b..2ffb2ac5e7e45 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -550,6 +550,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx941"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: return "gfx942"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: + return "gfx950"; // AMDGCN GFX10. 
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 130b8798ab4a4..ca0ea03452d3b 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -609,6 +609,7 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d7feaef8c4a97..d028c1f5ca761 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -360,6 +360,12 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", "Additional instructions for GFX940+" >; +def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", + "GFX950Insts", + "true", + "Additional instructions for GFX950+" +>; + def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "GFX10Insts", "true", @@ -1470,6 +1476,14 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst ]>; +def FeatureISAVersion9_5_Common : FeatureSet< + !listconcat(FeatureISAVersion9_4_Common.Features, + [FeatureFP8Insts, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, + FeatureGFX950Insts + ])>; + def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ @@ -1503,6 +1517,8 @@ def FeatureISAVersion9_4_Generic : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [FeatureRequiresCOV6])>; +def FeatureISAVersion9_5_0 : FeatureSet; + def FeatureISAVersion10_Common : FeatureSet< [FeatureGFX10, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 067043d290b76..3403cbab526d4 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -204,6 +204,10 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, FeatureISAVersion9_4_2.Features >; +def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_5_0.Features +>; + // [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c] def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6ff964077d8fd..1b06756a8a101 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -106,6 +106,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool GFX9Insts = false; bool GFX90AInsts = false; bool GFX940Insts = false; + bool GFX950Insts = false; bool GFX10Insts = false; bool GFX11Insts = false; bool GFX12Insts = false; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 55ba5ebbebb8f..ffde4d33f1341 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -96,6 +96,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = 
GK_GFX942; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; @@ -182,6 +183,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; + case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 7dfb8c021a8a5..b0385915f3042 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -107,6 +107,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, + {{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, @@ -262,6 +263,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX940: return {9, 4, 0}; case GK_GFX941: return {9, 4, 1}; case GK_GFX942: return {9, 4, 2}; + case GK_GFX950: return {9, 5, 0}; case GK_GFX1010: return {10, 1, 0}; case GK_GFX1011: return {10, 1, 1}; case GK_GFX1012: return {10, 1, 2}; @@ -361,7 +363,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; } else if (T.isAMDGCN()) { - switch (parseArchAMDGCN(GPU)) { + AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); + switch (Kind) { case GK_GFX1201: case GK_GFX1200: case GK_GFX12_GENERIC: @@ -466,12 +469,16 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["s-memtime-inst"] = true; Features["gws"] = true; break; + case GK_GFX950: + Features["gfx950-insts"] = true; + [[fallthrough]]; case GK_GFX942: case GK_GFX941: case GK_GFX940: Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; - Features["xf32-insts"] = true; + if (Kind != GK_GFX950) + Features["xf32-insts"] = true; [[fallthrough]]; case GK_GFX9_4_GENERIC: Features["gfx940-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 1c9f35dd45fee..425fc5884cec7 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s +; RUN: llc -march=amdgcn 
-mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s +; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s ; TODO: Add global-isel when it can support bf16 @@ -198,19 +199,33 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 -; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-940-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-950-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX-950-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.cvt = fptrunc float %a to bfloat store bfloat %a.cvt, ptr %out @@ -218,20 +233,35 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16_abs: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 -; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 -; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16_abs: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16_abs: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GFX-950-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-950-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-950-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: 
v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.abs = call float @llvm.fabs.f32(float %a) %a.cvt = fptrunc float %a.abs to bfloat @@ -240,20 +270,35 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16_neg: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 -; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16_neg: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16_neg: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX-950-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-950-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-950-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.neg = fneg float %a %a.cvt = fptrunc float %a.neg to bfloat @@ -262,29 +307,53 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GCN-NEXT: v_and_b32_e32 v7, 1, v6 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v6, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: v_and_or_b32 v5, v1, s0, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-940-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-940-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-940-NEXT: 
s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-940-NEXT: s_brev_b32 s0, 1 +; GFX-940-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f64_to_bf16: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; GFX-950-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.cvt = fptrunc double %a to bfloat store bfloat %a.cvt, ptr %out @@ -292,30 +361,55 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16_neg: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GCN-NEXT: v_and_b32_e32 v8, 1, v7 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v7, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: s_brev_b32 s4, 1 -; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16_neg: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: s_brev_b32 s4, 1 +; GFX-940-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-940-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: 
v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f64_to_bf16_neg: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_brev_b32 s4, 1 +; GFX-950-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-950-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.neg = fneg double %a %a.cvt = fptrunc double %a.neg to bfloat @@ -324,30 +418,55 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16_abs: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GCN-NEXT: v_and_b32_e32 v8, 1, v7 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v7, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16_abs: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-940-NEXT: s_brev_b32 s0, 1 +; GFX-940-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm 
+; +; GFX-950-LABEL: fptrunc_f64_to_bf16_abs: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; GFX-950-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.abs = call double @llvm.fabs.f64(double %a) %a.cvt = fptrunc double %a.abs to bfloat diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index 4eac26e853c2a..b64968c9336b9 100644 --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -80,6 +80,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX950-NOXNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX950-XNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX1010 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX1010-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX1010-XNACK %s @@ -180,6 +183,9 @@ ; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942" ; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-" ; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+" +; GFX950: .amdgcn_target "amdgcn-amd-amdhsa--gfx950" +; GFX950-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack-" +; GFX950-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack+" ; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010" ; GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack-" ; GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack+" diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index f1f4edb94a617..99344f16d4cd6 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -57,6 +57,7 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj 
--file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1011 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1012 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s @@ -139,6 +140,7 @@ ; GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) ; GFX941: EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B) ; GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) +; GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) ; GFX1012: EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll index 961b89ab28f62..3ad2a9df764be 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll @@ -12,6 +12,9 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s + ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_FEATURE_XNACK_V3 (0x100) ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) @@ -44,6 +47,11 @@ ; SRAM-ECC-GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) ; SRAM-ECC-GFX940: ] +; SRAM-ECC-GFX950: Flags [ +; SRAM-ECC-GFX950: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) +; SRAM-ECC-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) +; SRAM-ECC-GFX950: ] + define amdgpu_kernel void @elf_header() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 27282a453075b..08122cd0d89ea 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define float @v_fmaximum3_f32(float %a, float %b, float %c) { ; GFX12-LABEL: v_fmaximum3_f32: @@ -19,9 +20,11 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 
v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -46,9 +49,11 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -71,10 +76,13 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -101,9 +109,11 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -129,9 +139,11 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call float @llvm.fabs.f32(float %b) @@ -157,9 +169,11 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call float @llvm.fabs.f32(float %c) @@ -185,9 +199,11 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -215,9 +231,11 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 
0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -245,9 +263,11 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -278,9 +298,11 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -306,9 +328,11 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg float %b @@ -334,9 +358,11 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg float %c @@ -362,9 +388,11 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v2, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float 8.0, float %b) @@ -389,9 +417,11 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -416,9 +446,11 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 
vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float 4.0, float %b) @@ -443,9 +475,11 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -472,9 +506,11 @@ define float @v_fmaximum3_f32_const1_const2(float %a) { ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 0x41800000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float 8.0) @@ -500,15 +536,19 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -534,15 +574,19 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -568,15 +612,19 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; 
GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) @@ -605,15 +653,19 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a @@ -642,15 +694,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> ) @@ -676,15 +732,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-NEXT: v_max_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -711,21 +771,27 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; 
GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v8, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -752,21 +818,27 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -793,21 +865,27 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e64 v4, |v0|, |v3| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) @@ -837,21 +915,27 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 
v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v1, -v7 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a @@ -881,21 +965,27 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> ) @@ -922,21 +1012,27 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-NEXT: v_max_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -962,9 +1058,11 @@ define half @v_fmaximum3_f16(half %a, 
half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -989,9 +1087,11 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1048,9 +1151,11 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1076,9 +1181,11 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1104,9 +1211,11 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1132,9 +1241,11 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: 
s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1162,9 +1273,11 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1192,9 +1305,11 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1225,9 +1340,11 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1253,9 +1370,11 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b @@ -1281,9 +1400,11 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c @@ -1309,9 +1430,11 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 8.0, half %b) @@ -1336,9 +1459,11 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) { ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1363,9 +1488,11 @@ 
define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 4.0, half %b) @@ -1390,9 +1517,11 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) { ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1419,9 +1548,11 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half 8.0) @@ -1448,19 +1579,23 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -1486,19 +1621,23 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; 
GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1527,22 +1666,25 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -1571,19 +1713,23 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -1610,21 +1756,25 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1650,19 +1800,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> ) @@ -1690,29 +1844,35 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v5, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -1740,29 +1900,35 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1799,33 +1965,37 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: 
v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -1856,29 +2026,35 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -1907,29 +2083,34 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7e00 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc 
-; GFX9-NEXT: s_mov_b32 s5, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX9-NEXT: s_movk_i32 s4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4 +; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v3 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1957,29 +2138,35 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> ) @@ -2007,33 +2194,41 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2061,33 +2256,41 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2124,37 +2327,43 @@ define 
<4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4 +; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2185,33 +2394,41 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; 
GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -2240,35 +2457,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_pk_max_f16 v8, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2296,33 +2519,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 
+; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> ) @@ -2346,12 +2577,14 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2377,12 +2610,14 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do ; ; GFX9-LABEL: s_fmaximum3_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX9-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_max_f64 
v[2:3], v[0:1], s[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call double @llvm.maximum.f64(double %a, double %b) %max1 = call double @llvm.maximum.f64(double %max0, double %c) @@ -2447,12 +2683,14 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2479,12 +2717,14 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2511,12 +2751,14 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2543,12 +2785,14 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2577,12 +2821,14 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 
v[6:7], -v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2611,12 +2857,14 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2648,12 +2896,14 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2680,12 +2930,14 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2712,12 +2964,14 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2743,15 +2997,17 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_const0: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2777,14 +3033,15 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2810,12 +3067,14 @@ define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], 4.0 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2841,12 +3100,14 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2871,17 +3132,18 @@ define double @v_fmaximum3_f64_const1_const2(double %a) { ; GFX9-LABEL: v_fmaximum3_f64_const1_const2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], 
s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40300000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40300000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 
0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c) @@ -3080,12 +3355,14 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 ret <2 x double> %insert.1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX940: {{.*}} +; GFX950: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index d9ba2de48bb01..43293512c8c21 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define float @v_fminimum3_f32(float %a, float %b, float %c) { ; GFX12-LABEL: v_fminimum3_f32: @@ -19,9 +20,11 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -46,9 +49,11 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -71,10 
+76,13 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre ; GFX9-NEXT: v_min_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -101,9 +109,11 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -129,9 +139,11 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call float @llvm.fabs.f32(float %b) @@ -157,9 +169,11 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call float @llvm.fabs.f32(float %c) @@ -185,9 +199,11 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -215,9 +231,11 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -245,9 +263,11 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -278,9 +298,11 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -306,9 +328,11 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg float %b @@ -334,9 +358,11 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg float %c @@ -362,9 +388,11 @@ define float @v_fminimum3_f32_const0(float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v2, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float 8.0, float %b) @@ -389,9 +417,11 @@ define float @v_fminimum3_f32__const2(float %a, float %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -416,9 +446,11 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float 4.0, float %b) @@ -443,9 +475,11 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, 
v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -472,9 +506,11 @@ define float @v_fminimum3_f32_const1_const2(float %a) { ; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float 8.0) @@ -500,15 +536,19 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -534,15 +574,19 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -568,15 +612,19 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) @@ -605,15 +653,19 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a @@ -642,15 +694,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> ) @@ -676,15 +732,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-NEXT: v_min_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -711,21 +771,27 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v8, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 
v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -752,21 +818,27 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -793,21 +865,27 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e64 v4, |v0|, |v3| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) @@ -837,21 +915,27 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v1, -v7 ; GFX9-NEXT: 
v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a @@ -881,21 +965,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> ) @@ -922,21 +1012,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-NEXT: v_min_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -962,9 +1058,11 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -989,9 +1087,11 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: 
v_min_f16_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1048,9 +1151,11 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1076,9 +1181,11 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1104,9 +1211,11 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1132,9 +1241,11 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1162,9 +1273,11 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1192,9 +1305,11 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; 
GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1225,9 +1340,11 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1253,9 +1370,11 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b @@ -1281,9 +1400,11 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c @@ -1309,9 +1430,11 @@ define half @v_fminimum3_f16_const0(half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 8.0, half %b) @@ -1336,9 +1459,11 @@ define half @v_fminimum3_f16__const2(half %a, half %b) { ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1363,9 +1488,11 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 4.0, half %b) @@ -1390,9 +1517,11 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) { ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1419,9 +1548,11 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half 8.0) @@ -1448,19 +1579,23 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -1486,19 +1621,23 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1527,22 +1666,25 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -1571,19 +1713,23 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -1610,21 +1756,25 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1650,19 +1800,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> ) @@ -1690,29 +1844,35 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v5, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -1740,29 +1900,35 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1799,33 +1965,37 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_pk_min_f16 v6, v6, v10 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call 
<3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -1856,29 +2026,35 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -1907,29 +2083,34 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7e00 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b32 s5, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 ; GFX9-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX9-NEXT: s_movk_i32 s4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4 +; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v3 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1957,29 +2138,35 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> ) @@ -2007,33 +2194,41 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: 
v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2061,33 +2256,41 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2124,37 +2327,43 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX9-NEXT: v_and_b32_e32 
v10, 0x7fff7fff, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4 +; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2185,33 +2394,41 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 
v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -2240,35 +2457,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_pk_min_f16 v8, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2296,33 +2519,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, 
v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> ) @@ -2346,12 +2577,14 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2377,12 +2610,14 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do ; ; GFX9-LABEL: s_fminimum3_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX9-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call double @llvm.minimum.f64(double %a, double %b) %max1 = call double @llvm.minimum.f64(double %max0, double %c) @@ -2447,12 +2683,14 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, 
|v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2479,12 +2717,14 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2511,12 +2751,14 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2543,12 +2785,14 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2577,12 +2821,14 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2611,12 +2857,14 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2648,12 +2896,14 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2680,12 +2930,14 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2712,12 +2964,14 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2743,15 +2997,17 @@ define double @v_fminimum3_f64_const0(double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_const0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2777,14 +3033,15 @@ define double @v_fminimum3_f64__const2(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2810,12 +3067,14 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], 4.0 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2841,12 +3100,14 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2871,17 +3132,18 @@ define double @v_fminimum3_f64_const1_const2(double %a) { ; GFX9-LABEL: v_fminimum3_f64_const1_const2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40300000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40300000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) ; 
GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float ; GFX9-NEXT: v_min_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c) @@ -3080,12 +3355,14 @@ define <2 x double> 
@v_no_fminimum3_f64__multi_use(double %a, double %b, double
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: s_nop 1
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: s_nop 1
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
 %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
 ret <2 x double> %insert.1
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX940: {{.*}}
+; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 8313f5b655efb..bd35ee3f00973 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index d90c4a75ac5de..e782f53cee608 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -30,24 +30,24 @@ define half @v_maximum_f16(half %src0, half %src1) {
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16: ; GFX10: ; %bb.0: @@ -102,12 +102,6 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -156,24 +150,24 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nsz: ; GFX10: ; %bb.0: @@ -228,12 +222,6 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -284,26 +272,26 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src0: ; GFX10: ; %bb.0: @@ -365,26 +353,26 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: 
v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src1: ; GFX10: ; %bb.0: @@ -453,34 +441,34 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f16_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_max_f16_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: @@ -567,35 +555,35 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; 
GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16: ; GFX10: ; %bb.0: @@ -668,12 +656,6 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -736,35 +718,35 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v2, v0, v1 +; 
GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nsz: ; GFX10: ; %bb.0: @@ -837,12 +819,6 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -917,50 +893,50 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_lshr_b32 s4, s17, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: s_lshr_b32 s5, s16, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: s_lshr_b32 s1, s1, 16 -; GFX940-NEXT: v_pk_max_f16 v1, s0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_lshr_b32 s0, s0, 16 -; GFX940-NEXT: v_mov_b32_e32 v3, s1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: 
v_mov_b32_e32 v1, s17 +; GFX900-NEXT: s_lshr_b32 s4, s17, 16 +; GFX900-NEXT: v_pk_max_f16 v1, s16, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: s_lshr_b32 s5, s16, 16 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: s_lshr_b32 s1, s1, 16 +; GFX950-NEXT: v_pk_max_f16 v1, s0, v1 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_lshr_b32 s0, s0, 16 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: @@ -1065,41 +1041,41 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, 
v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16: ; GFX10: ; %bb.0: @@ -1187,13 +1163,6 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1269,41 +1238,41 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nsz: ; GFX10: ; %bb.0: @@ -1391,13 +1360,6 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1487,51 +1449,51 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: 
v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16: ; GFX10: ; %bb.0: @@ -1635,13 +1597,6 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1731,51 +1686,51 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc 
+; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nsz: ; GFX10: ; %bb.0: @@ -1879,13 +1834,6 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2023,83 +1971,83 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX9-NEXT: v_pk_max_f16 v7, v2, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX9-NEXT: v_pk_max_f16 v6, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX9-NEXT: v_pk_max_f16 v5, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v7, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v8f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v7, v2, v6 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v6, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX940-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX940-NEXT: s_nop 0 
-; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v5, v0, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v6, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_pk_max_f16 v7, v2, v6 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_pk_max_f16 v6, v1, v5 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_pk_max_f16 v5, v0, v4 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v7, v2, v6 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v6, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: 
v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v5, v0, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f16: ; GFX10: ; %bb.0: @@ -2400,147 +2348,147 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX9-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX9-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX9-NEXT: v_pk_max_f16 v13, v4, v12 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX9-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX9-NEXT: v_pk_max_f16 v11, v2, v10 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX9-NEXT: v_pk_max_f16 v10, v1, v9 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX9-NEXT: v_pk_max_f16 v9, v0, v8 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, 
v10, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v12, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v13, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v14, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v15, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v18, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX940-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX940-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v13, v4, v12 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX940-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX940-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v11, v2, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX940-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v10, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX940-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v9, v0, v8 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX940-NEXT: 
v_lshrrev_b32_e32 v9, 16, v9 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v10, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_pk_max_f16 v15, v6, v14 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_pk_max_f16 v13, v4, v12 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_pk_max_f16 v12, v3, v11 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_pk_max_f16 v11, v2, v10 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX900-NEXT: v_pk_max_f16 v10, v1, v9 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_pk_max_f16 v9, v0, v8 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v12, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v13, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v14, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v15, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v18, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX950-NEXT: v_mov_b32_e32 v17, 
0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v15, v6, v14 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v13, v4, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v12, v3, v11 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v11, v2, v10 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v10, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v9, v0, v8 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 48851cb030233..c1fdfa2c4cf9a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -26,24 +27,24 @@ define float @v_maximum_f32(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32: ; GFX10: ; %bb.0: @@ -94,12 +95,6 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -144,24 +139,24 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nsz: ; GFX10: ; %bb.0: @@ -212,12 +207,6 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -264,26 +253,26 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan_src0: ; GFX10: ; %bb.0: @@ -341,26 +330,26 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 
1.0, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan_src1: ; GFX10: ; %bb.0: @@ -424,32 +413,32 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f32_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_max_f32_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: @@ -517,31 +506,31 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> 
%src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32: ; GFX10: ; %bb.0: @@ -601,13 +590,6 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -660,31 +642,31 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nsz: ; GFX10: ; %bb.0: @@ -744,13 +726,6 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -813,40 +788,40 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s19 -; GFX9-NEXT: v_max_f32_e32 v1, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s16, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_max_f32_e32 v1, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_max_f32_e32 v3, s0, v0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s19 +; GFX900-NEXT: v_max_f32_e32 v1, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: 
v_max_f32_e32 v3, s16, v0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s3 +; GFX950-NEXT: v_max_f32_e32 v1, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s2 +; GFX950-NEXT: v_max_f32_e32 v3, s0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: @@ -927,38 +902,38 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32: ; GFX10: ; %bb.0: @@ -1028,14 +1003,6 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1097,38 +1064,38 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nsz: ; GFX10: ; %bb.0: @@ -1198,14 +1165,6 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x 
float> %src0, <3 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1273,45 +1232,45 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: 
v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32: ; GFX10: ; %bb.0: @@ -1391,15 +1350,6 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1469,45 +1419,45 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 
0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nsz: ; GFX10: ; %bb.0: @@ -1587,15 +1537,6 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1689,73 +1630,73 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v16, v0, v8 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v1, v9 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v2, v10 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v3, v11 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v4, v12 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v5, v13 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v6, v14 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v7, v15 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v8f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v16, v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: v_max_f32_e32 v8, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v2, v10 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v3, v11 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; 
GFX940-NEXT: v_max_f32_e32 v8, v4, v12 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v5, v13 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v6, v14 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v7, v15 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v16, v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v1, v9 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v2, v10 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v3, v11 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v4, v12 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v5, v13 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v6, v14 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v7, v15 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v16, v0, v8 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX950-NEXT: v_max_f32_e32 v8, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v2, v10 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v3, v11 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v4, v12 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v5, v13 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v6, v14 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v7, v15 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f32: ; GFX10: ; %bb.0: @@ -1968,136 +1909,136 @@ define 
<16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v32, 0x7fc00000 -; GFX940-NEXT: v_max_f32_e32 v33, v0, v16 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: v_max_f32_e32 v34, v1, v17 
-; GFX940-NEXT: v_max_f32_e32 v35, v2, v18 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: v_max_f32_e32 v36, v3, v19 -; GFX940-NEXT: v_max_f32_e32 v37, v4, v20 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: v_max_f32_e32 v38, v5, v21 -; GFX940-NEXT: v_max_f32_e32 v39, v6, v22 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: v_max_f32_e32 v48, v7, v23 -; GFX940-NEXT: v_max_f32_e32 v49, v8, v24 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: v_max_f32_e32 v50, v9, v25 -; GFX940-NEXT: v_max_f32_e32 v51, v10, v26 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: v_max_f32_e32 v52, v11, v27 -; GFX940-NEXT: v_max_f32_e32 v53, v12, v28 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: v_max_f32_e32 v54, v13, v29 -; GFX940-NEXT: v_max_f32_e32 v55, v14, v30 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v16, v15, v31 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_max_f32_e32 v18, v13, v29 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; 
GFX900-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_readlane_b32 s31, v31, 1 +; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v32, 0x7fc00000 +; GFX950-NEXT: v_max_f32_e32 v33, v0, v16 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 +; GFX950-NEXT: v_max_f32_e32 v34, v1, v17 +; GFX950-NEXT: v_max_f32_e32 v35, v2, v18 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX950-NEXT: v_max_f32_e32 v36, v3, v19 +; GFX950-NEXT: v_max_f32_e32 v37, v4, v20 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 +; GFX950-NEXT: v_max_f32_e32 v38, v5, v21 +; GFX950-NEXT: v_max_f32_e32 v39, v6, v22 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 +; GFX950-NEXT: v_max_f32_e32 v48, v7, v23 +; GFX950-NEXT: v_max_f32_e32 v49, v8, v24 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 +; GFX950-NEXT: v_max_f32_e32 v50, v9, v25 +; GFX950-NEXT: v_max_f32_e32 v51, v10, v26 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 +; GFX950-NEXT: v_max_f32_e32 v52, v11, v27 +; GFX950-NEXT: v_max_f32_e32 v53, v12, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 +; GFX950-NEXT: v_max_f32_e32 v54, v13, v29 +; GFX950-NEXT: v_max_f32_e32 v55, v14, v30 +; GFX950-NEXT: 
v_cndmask_b32_e32 v6, v32, v39, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v23
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_max_f32_e32 v16, v15, v31
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v8, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v9, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v10, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v11, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v12, v28
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v13, v29
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v14, v30
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc
+; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v15, v31
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f32:
 ; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 80a0a194713d9..e354ec6fb3dd7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -28,26 +29,26 @@ define double @v_maximum_f64(double %src0, double %src1) {
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f64:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f64:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; 
GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64: ; GFX10: ; %bb.0: @@ -100,12 +101,6 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) { ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -152,26 +147,26 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nsz: ; GFX10: ; %bb.0: @@ -224,12 +219,6 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) { ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -278,28 +267,28 @@ define double 
@v_maximum_f64__nnan_src0(double %arg0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nnan_src0: ; GFX10: ; %bb.0: @@ -362,28 +351,28 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: v_maximum_f64__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nnan_src1: ; GFX10: ; %bb.0: @@ -454,35 +443,35 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_mov_b32_e32 v1, s19 +; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: @@ -555,35 +544,35 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 
v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64: ; GFX10: ; %bb.0: @@ -648,13 +637,6 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -712,35 +694,35 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: 
v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64__nsz: ; GFX10: ; %bb.0: @@ -805,13 +787,6 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -883,46 +858,46 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:3] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] -; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; 
GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s22 +; GFX900-NEXT: v_mov_b32_e32 v4, s20 +; GFX900-NEXT: v_mov_b32_e32 v1, s23 +; GFX900-NEXT: v_mov_b32_e32 v5, s21 +; GFX900-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX900-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] +; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: @@ -1012,44 +987,44 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; 
GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64: ; GFX10: ; %bb.0: @@ -1125,14 +1100,6 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1201,44 +1168,44 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64__nsz: ; GFX10: ; %bb.0: @@ -1314,14 +1281,6 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1398,53 +1357,53 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; 
GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64: ; GFX10: ; %bb.0: @@ -1532,15 +1491,6 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1620,53 +1570,53 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: 
v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64__nsz: ; GFX10: ; %bb.0: @@ -1754,15 +1704,6 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1878,89 +1819,89 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f64: -; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX9-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX9-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX9-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v8f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v54, 0x7ff80000 -; GFX940-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] -; GFX940-NEXT: v_max_f64 v[36:37], v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_max_f64 v[38:39], v[6:7], v[22:23] -; GFX940-NEXT: v_max_f64 v[48:49], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_max_f64 v[50:51], v[10:11], v[26:27] -; GFX940-NEXT: v_max_f64 v[52:53], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 
v11, v51, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX900-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX900-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX900-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX900-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX900-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX900-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX950-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX950-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX950-NEXT: v_max_f64 v[36:37], v[4:5], v[20:21] +; GFX950-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX950-NEXT: v_max_f64 v[38:39], v[6:7], v[22:23] +; GFX950-NEXT: v_max_f64 v[48:49], v[8:9], v[24:25] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX950-NEXT: v_max_f64 v[50:51], v[10:11], v[26:27] +; GFX950-NEXT: v_max_f64 v[52:53], v[12:13], v[28:29] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, 
v[6:7], v[22:23] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f64: ; GFX10: ; %bb.0: @@ -2332,295 +2273,295 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_writelane_b32 v34, s30, 0 -; GFX9-NEXT: v_writelane_b32 v34, s31, 1 -; GFX9-NEXT: v_writelane_b32 v34, s34, 2 -; GFX9-NEXT: v_writelane_b32 v34, s35, 3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] -; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] -; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] -; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] -; GFX9-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] -; GFX9-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] -; GFX9-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] -; GFX9-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] -; GFX9-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] -; GFX9-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] -; GFX9-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] -; GFX9-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] -; GFX9-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] -; GFX9-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 
v9, v32, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX9-NEXT: v_readlane_b32 s35, v34, 3 -; GFX9-NEXT: v_readlane_b32 s34, v34, 2 -; GFX9-NEXT: v_readlane_b32 s31, v34, 1 -; GFX9-NEXT: v_readlane_b32 s30, v34, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 -; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 -; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload 
Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] -; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] -; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 -; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_max_f64 v[56:57], v[8:9], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] -; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX940-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_max_f64 v[46:47], v[10:11], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] -; GFX940-NEXT: s_waitcnt vmcnt(19) -; GFX940-NEXT: v_max_f64 v[44:45], v[12:13], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] -; GFX940-NEXT: s_waitcnt vmcnt(17) -; GFX940-NEXT: v_max_f64 v[42:43], v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: s_waitcnt vmcnt(15) -; GFX940-NEXT: v_max_f64 v[40:41], v[16:17], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] -; GFX940-NEXT: s_waitcnt vmcnt(13) -; GFX940-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] -; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] -; GFX940-NEXT: s_waitcnt vmcnt(11) -; GFX940-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] -; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: s_waitcnt vmcnt(9) -; GFX940-NEXT: v_max_f64 v[50:51], v[22:23], v[34:35] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse -; 
GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(4) -; GFX940-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[32:33], v[30:31], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_writelane_b32 v34, s30, 0 +; GFX900-NEXT: v_writelane_b32 v34, s31, 1 +; GFX900-NEXT: v_writelane_b32 v34, s34, 2 +; GFX900-NEXT: v_writelane_b32 v34, s35, 3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX900-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX900-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX900-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] +; 
GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX900-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX900-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX900-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX900-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX900-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX900-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX900-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX900-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX900-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX900-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX900-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] +; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX900-NEXT: v_readlane_b32 s35, v34, 3 +; GFX900-NEXT: v_readlane_b32 s34, v34, 2 +; GFX900-NEXT: v_readlane_b32 s31, v34, 1 +; GFX900-NEXT: v_readlane_b32 s30, v34, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 +; 
GFX950-NEXT: scratch_load_dword v46, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:72 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] +; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] +; GFX950-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_max_f64 v[56:57], v[8:9], v[46:47] +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_max_f64 v[46:47], v[10:11], v[44:45] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_max_f64 v[44:45], v[12:13], v[42:43] +; GFX950-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; GFX950-NEXT: s_waitcnt vmcnt(17) +; GFX950-NEXT: v_max_f64 v[42:43], v[14:15], v[40:41] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX950-NEXT: 
v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_max_f64 v[40:41], v[16:17], v[54:55] +; GFX950-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] +; GFX950-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] +; GFX950-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_max_f64 v[50:51], v[22:23], v[34:35] +; GFX950-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[32:33], v[30:31], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f64: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 
a74043378a259..329a85f91c251 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -2,7 +2,8 @@ ; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -17,24 +18,24 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16: ; GFX10: ; %bb.0: @@ -79,12 +80,6 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -120,24 +115,24 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nsz: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nsz: ; GFX10: ; %bb.0: @@ -182,12 +177,6 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -224,26 +213,26 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src0: ; GFX10: ; %bb.0: @@ -291,26 +280,26 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src1: ; GFX10: ; %bb.0: @@ -362,34 +351,34 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f16_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_min_f16_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: @@ -456,35 +445,35 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16: ; GFX10: ; %bb.0: @@ -542,12 +531,6 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -590,35 +573,35 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nsz: ; GFX10: ; %bb.0: @@ -676,12 +659,6 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -729,50 +706,50 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_lshr_b32 s4, s17, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: s_lshr_b32 s5, s16, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 
vcc, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: s_lshr_b32 s1, s1, 16 -; GFX940-NEXT: v_pk_min_f16 v1, s0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_lshr_b32 s0, s0, 16 -; GFX940-NEXT: v_mov_b32_e32 v3, s1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_mov_b32_e32 v1, s17 +; GFX900-NEXT: s_lshr_b32 s4, s17, 16 +; GFX900-NEXT: v_pk_min_f16 v1, s16, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: s_lshr_b32 s5, s16, 16 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: s_lshr_b32 s1, s1, 16 +; GFX950-NEXT: v_pk_min_f16 v1, s0, v1 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_lshr_b32 s0, s0, 16 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: @@ -850,41 +827,41 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16: ; GFX10: ; %bb.0: @@ -952,13 +929,6 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1007,41 +977,41 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: 
v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nsz: ; GFX10: ; %bb.0: @@ -1109,13 +1079,6 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1171,51 +1134,51 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16: ; GFX10: ; %bb.0: @@ -1294,13 +1257,6 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1356,51 +1312,51 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 
16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nsz: ; GFX10: ; %bb.0: @@ -1479,13 +1435,6 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1561,83 +1510,83 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX9-NEXT: v_pk_min_f16 v7, v2, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX9-NEXT: v_pk_min_f16 v6, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX9-NEXT: v_pk_min_f16 v5, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; 
GFX9-NEXT: v_perm_b32 v1, v1, v7, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v7, v2, v6 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v6, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX940-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v5, v0, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v6, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_pk_min_f16 v7, v2, v6 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_pk_min_f16 v6, v1, v5 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_pk_min_f16 v5, v0, v4 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s4 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v7, v2, v6 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v6, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v5, v0, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f16: ; GFX10: ; %bb.0: @@ -1818,147 +1767,147 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX9-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX9-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX9-NEXT: v_pk_min_f16 v13, v4, v12 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX9-NEXT: v_pk_min_f16 v12, v3, v11 
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX9-NEXT: v_pk_min_f16 v11, v2, v10 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX9-NEXT: v_pk_min_f16 v10, v1, v9 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX9-NEXT: v_pk_min_f16 v9, v0, v8 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v10, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v12, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v13, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v14, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v15, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v18, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX940-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX940-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v13, v4, v12 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX940-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v12, v3, v11 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX940-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v17, 
v12, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v11, v2, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX940-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v10, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX940-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v9, v0, v8 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v10, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_pk_min_f16 v15, v6, v14 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_pk_min_f16 v13, v4, v12 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_pk_min_f16 v12, v3, v11 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_pk_min_f16 v11, v2, v10 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, 
vcc +; GFX900-NEXT: v_pk_min_f16 v10, v1, v9 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_pk_min_f16 v9, v0, v8 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v12, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v13, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v14, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v15, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v18, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v16f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v15, v6, v14 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v13, v4, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v12, v3, v11 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v11, v2, v10 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v10, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v9, v0, v8 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 2b3041290b586..2614fb3bf9f73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -26,24 +27,24 @@ define float @v_minimum_f32(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; 
GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32: ; GFX10: ; %bb.0: @@ -94,12 +95,6 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -144,24 +139,24 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nsz: ; GFX10: ; %bb.0: @@ -212,12 +207,6 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -264,26 +253,26 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: 
s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nnan_src0: ; GFX10: ; %bb.0: @@ -341,26 +330,26 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nnan_src1: ; GFX10: ; %bb.0: @@ -424,32 +413,32 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f32_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 
vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_min_f32_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: @@ -517,31 +506,31 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32: ; GFX10: ; %bb.0: @@ -601,13 +590,6 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f32__nnan: -; GFX940: ; 
%bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -660,31 +642,31 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nsz: ; GFX10: ; %bb.0: @@ -744,13 +726,6 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -813,40 +788,40 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s19 -; GFX9-NEXT: v_min_f32_e32 v1, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: 
v_min_f32_e32 v3, s16, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_min_f32_e32 v1, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_min_f32_e32 v3, s0, v0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s19 +; GFX900-NEXT: v_min_f32_e32 v1, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_min_f32_e32 v3, s16, v0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s3 +; GFX950-NEXT: v_min_f32_e32 v1, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s2 +; GFX950-NEXT: v_min_f32_e32 v3, s0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: @@ -927,38 +902,38 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32: ; GFX10: ; %bb.0: @@ -1028,14 +1003,6 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1097,38 +1064,38 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v6, v0, v3 
+; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nsz: ; GFX10: ; %bb.0: @@ -1198,14 +1165,6 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1273,45 +1232,45 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32: ; GFX10: ; %bb.0: @@ -1391,15 +1350,6 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1469,45 +1419,45 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 
-; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nsz: ; GFX10: ; %bb.0: @@ -1587,15 +1537,6 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1689,73 +1630,73 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v16, v0, v8 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v1, v9 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v2, v10 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v3, v11 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v4, v12 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; 
GFX9-NEXT: v_min_f32_e32 v8, v5, v13 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v6, v14 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v7, v15 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v16, v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: v_min_f32_e32 v8, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v2, v10 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v3, v11 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v4, v12 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v5, v13 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v6, v14 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v7, v15 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v16, v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v1, v9 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v2, v10 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v3, v11 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v4, v12 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v5, v13 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v6, v14 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v7, v15 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v16, v0, v8 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX950-NEXT: v_min_f32_e32 v8, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: 
v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v2, v10 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v3, v11 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v4, v12 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v5, v13 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v6, v14 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v7, v15 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f32: ; GFX10: ; %bb.0: @@ -1968,136 +1909,136 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX9-NEXT: 
v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v32, 0x7fc00000 -; GFX940-NEXT: v_min_f32_e32 v33, v0, v16 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: v_min_f32_e32 v34, v1, v17 -; GFX940-NEXT: v_min_f32_e32 v35, v2, v18 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: v_min_f32_e32 v36, v3, v19 -; GFX940-NEXT: v_min_f32_e32 v37, v4, v20 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: v_min_f32_e32 v38, v5, v21 -; GFX940-NEXT: v_min_f32_e32 v39, v6, v22 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: v_min_f32_e32 v48, v7, v23 -; GFX940-NEXT: v_min_f32_e32 v49, v8, v24 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: v_min_f32_e32 v50, v9, v25 -; GFX940-NEXT: v_min_f32_e32 v51, v10, v26 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: v_min_f32_e32 v52, v11, v27 -; GFX940-NEXT: v_min_f32_e32 v53, v12, v28 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: v_min_f32_e32 v54, v13, v29 -; GFX940-NEXT: v_min_f32_e32 v55, v14, v30 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v16, v15, v31 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 
-; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_min_f32_e32 v18, v13, v29 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_readlane_b32 s31, v31, 1 +; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v18, v15, v16 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v16f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v32, 0x7fc00000 +; GFX950-NEXT: v_min_f32_e32 v33, v0, v16 +; 
GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 +; GFX950-NEXT: v_min_f32_e32 v34, v1, v17 +; GFX950-NEXT: v_min_f32_e32 v35, v2, v18 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX950-NEXT: v_min_f32_e32 v36, v3, v19 +; GFX950-NEXT: v_min_f32_e32 v37, v4, v20 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 +; GFX950-NEXT: v_min_f32_e32 v38, v5, v21 +; GFX950-NEXT: v_min_f32_e32 v39, v6, v22 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 +; GFX950-NEXT: v_min_f32_e32 v48, v7, v23 +; GFX950-NEXT: v_min_f32_e32 v49, v8, v24 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 +; GFX950-NEXT: v_min_f32_e32 v50, v9, v25 +; GFX950-NEXT: v_min_f32_e32 v51, v10, v26 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 +; GFX950-NEXT: v_min_f32_e32 v52, v11, v27 +; GFX950-NEXT: v_min_f32_e32 v53, v12, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 +; GFX950-NEXT: v_min_f32_e32 v54, v13, v29 +; GFX950-NEXT: v_min_f32_e32 v55, v14, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v16, v15, v31 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f32: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 567582c9f58ff..71fdd691a1512 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | 
FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -28,26 +29,26 @@ define double @v_minimum_f64(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64: ; GFX10: ; %bb.0: @@ -100,12 +101,6 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) { ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -152,26 +147,26 @@ define double @v_minimum_f64__nsz(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] 
+; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nsz: ; GFX10: ; %bb.0: @@ -224,12 +219,6 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) { ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -278,28 +267,28 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nnan_src0: ; GFX10: ; %bb.0: @@ -362,28 +351,28 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nnan_src1: ; GFX10: ; %bb.0: @@ -454,35 +443,35 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_mov_b32_e32 v1, s19 +; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: s_minimum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: @@ -555,35 +544,35 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64: ; GFX10: ; %bb.0: @@ -648,13 +637,6 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -712,35 +694,35 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64__nsz: ; GFX10: ; %bb.0: @@ -805,13 +787,6 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) @@ -883,46 +858,46 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:3] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] -; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s22 +; GFX900-NEXT: v_mov_b32_e32 v4, s20 +; GFX900-NEXT: v_mov_b32_e32 v1, s23 +; GFX900-NEXT: v_mov_b32_e32 v5, s21 +; GFX900-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX900-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] +; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: @@ -1012,44 +987,44 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, 
<3 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64: ; GFX10: ; %bb.0: @@ -1125,14 +1100,6 @@ define <3 x 
double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1201,44 +1168,44 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; 
GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64__nsz: ; GFX10: ; %bb.0: @@ -1314,14 +1281,6 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1398,53 +1357,53 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 
s[30:31] +; GFX900-LABEL: v_minimum_v4f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64: ; GFX10: ; %bb.0: @@ -1532,15 +1491,6 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1620,53 +1570,53 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: 
v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: 
v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64__nsz: ; GFX10: ; %bb.0: @@ -1754,15 +1704,6 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1878,89 +1819,89 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX9-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX9-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX9-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v54, 0x7ff80000 -; GFX940-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] -; GFX940-NEXT: v_min_f64 v[36:37], v[4:5], v[20:21] -; 
GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_min_f64 v[38:39], v[6:7], v[22:23] -; GFX940-NEXT: v_min_f64 v[48:49], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_min_f64 v[50:51], v[10:11], v[26:27] -; GFX940-NEXT: v_min_f64 v[52:53], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX900-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX900-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX900-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX900-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX900-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX900-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; 
GFX900-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX950-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX950-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX950-NEXT: v_min_f64 v[36:37], v[4:5], v[20:21] +; GFX950-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX950-NEXT: v_min_f64 v[38:39], v[6:7], v[22:23] +; GFX950-NEXT: v_min_f64 v[48:49], v[8:9], v[24:25] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX950-NEXT: v_min_f64 v[50:51], v[10:11], v[26:27] +; GFX950-NEXT: v_min_f64 v[52:53], v[12:13], v[28:29] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f64: ; GFX10: ; %bb.0: @@ -2332,295 +2273,295 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_writelane_b32 v34, s30, 0 -; GFX9-NEXT: v_writelane_b32 v34, s31, 1 -; GFX9-NEXT: v_writelane_b32 v34, s34, 2 -; GFX9-NEXT: v_writelane_b32 v34, s35, 3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] -; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:20 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] -; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] -; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] -; GFX9-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] -; GFX9-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] -; GFX9-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] -; GFX9-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] -; GFX9-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] -; GFX9-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] -; GFX9-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] -; GFX9-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] -; GFX9-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] -; GFX9-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] -; GFX9-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] -; GFX9-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX9-NEXT: v_readlane_b32 s35, v34, 3 -; GFX9-NEXT: v_readlane_b32 s34, v34, 2 -; GFX9-NEXT: v_readlane_b32 s31, v34, 1 -; GFX9-NEXT: v_readlane_b32 s30, v34, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 -; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 -; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 -; 
GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] -; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] -; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 -; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_min_f64 v[56:57], v[8:9], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] -; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX940-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_min_f64 v[46:47], v[10:11], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] -; GFX940-NEXT: s_waitcnt vmcnt(19) -; GFX940-NEXT: v_min_f64 v[44:45], v[12:13], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] -; GFX940-NEXT: s_waitcnt vmcnt(17) -; GFX940-NEXT: v_min_f64 v[42:43], v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] -; GFX940-NEXT: 
v_cndmask_b32_e64 v12, v44, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: s_waitcnt vmcnt(15) -; GFX940-NEXT: v_min_f64 v[40:41], v[16:17], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] -; GFX940-NEXT: s_waitcnt vmcnt(13) -; GFX940-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] -; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] -; GFX940-NEXT: s_waitcnt vmcnt(11) -; GFX940-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] -; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: s_waitcnt vmcnt(9) -; GFX940-NEXT: v_min_f64 v[50:51], v[22:23], v[34:35] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(4) -; GFX940-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f64 v[32:33], v[30:31], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_writelane_b32 v34, s30, 0 +; GFX900-NEXT: v_writelane_b32 v34, s31, 1 +; GFX900-NEXT: v_writelane_b32 v34, s34, 2 +; GFX900-NEXT: v_writelane_b32 v34, s35, 3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX900-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX900-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX900-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX900-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX900-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX900-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX900-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX900-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX900-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX900-NEXT: v_min_f64 v[18:19], v[18:19], 
v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX900-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX900-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX900-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX900-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] +; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX900-NEXT: v_readlane_b32 s35, v34, 3 +; GFX900-NEXT: v_readlane_b32 s34, v34, 2 +; GFX900-NEXT: v_readlane_b32 s31, v34, 1 +; GFX900-NEXT: v_readlane_b32 s30, v34, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_minimum_v16f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:72 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] +; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], 
v[0:1], v[56:57] +; GFX950-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_min_f64 v[56:57], v[8:9], v[46:47] +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_min_f64 v[46:47], v[10:11], v[44:45] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_min_f64 v[44:45], v[12:13], v[42:43] +; GFX950-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; GFX950-NEXT: s_waitcnt vmcnt(17) +; GFX950-NEXT: v_min_f64 v[42:43], v[14:15], v[40:41] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_min_f64 v[40:41], v[16:17], v[54:55] +; GFX950-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] +; GFX950-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] +; GFX950-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_min_f64 v[50:51], v[22:23], v[34:35] +; GFX950-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX950-NEXT: 
v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[32:33], v[30:31], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f64: ; GFX10: ; %bb.0: diff --git a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s index fde3d2057b2ad..d3ca4281dca41 100644 --- a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s +++ b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s @@ -1,4 +1,5 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck -check-prefix=GFX940 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX940 %s scratch_load_dword a2, v4, s6 // GFX940: scratch_load_dword a2, v4, s6 ; encoding: [0x00,0x60,0x50,0xdc,0x04,0x00,0x86,0x02] diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s index e208b6cf903d3..e2e84f27b828a 100644 --- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -1,4 +1,5 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s new file mode 100644 index 0000000000000..f8bbd40b700fd --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s @@ -0,0 +1,179 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s + +//===----------------------------------------------------------------------===// +// v_mfma_f32_32x32x4_xf32 +//===----------------------------------------------------------------------===// + +v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], 
a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + + +//===----------------------------------------------------------------------===// +// v_mfma_f32_16x16x8_xf32 +//===----------------------------------------------------------------------===// + +v_mfma_f32_16x16x8_xf32 a[0:3], 
v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + + +v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt new file mode 100644 index 0000000000000..0697ee8661e76 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s + +# GFX950: warning: invalid instruction encoding +0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04 \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt index 9575e50f16312..63e425fdb4ec9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt @@ -1,4 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s # GFX940: global_load_dword v2, v[2:3], off sc0 ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02] 0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02 diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml index 9c79ea588f624..416419b3a333f 100644 --- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -162,6 +162,10 @@ # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX942 %s # RUN: obj2yaml %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX942 %s +# RUN: sed -e 's//64/' -e 's//AMDGCN_GFX950/' %s | yaml2obj -o %t.o.AMDGCN_GFX950 +# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX950 %s +# RUN: obj2yaml %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX950 %s + # RUN: sed -e 's//64/' -e 's//AMDGCN_GFX1010/' %s | yaml2obj -o %t.o.AMDGCN_GFX1010 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1010 %s # RUN: obj2yaml %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1010 %s @@ -411,6 +415,9 @@ # ELF-AMDGCN-GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) # YAML-AMDGCN-GFX942: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX942 ] +# ELF-AMDGCN-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) +# YAML-AMDGCN-GFX950: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX950 ] + # ELF-AMDGCN-GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) # YAML-AMDGCN-GFX1010: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1010 ] diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll index 45071ecb75132..8d5307372a303 100644 --- 
a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll @@ -137,7 +137,6 @@ define amdgpu_kernel void @test_kernel() { ; ----------------------------------GFX9--------------------------------------- ; - ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt ; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt @@ -148,6 +147,11 @@ define amdgpu_kernel void @test_kernel() { ; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt ; RUN: diff %t-specify.txt %t-detect.txt +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=obj -O0 -o %t.o %s +; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx950 %t.o > %t-specify.txt +; RUN: llvm-objdump -D %t.o > %t-detect.txt +; RUN: diff %t-specify.txt %t-detect.txt + ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj -O0 -o %t.o %s ; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx942 %t.o > %t-specify.txt ; RUN: llvm-objdump -D %t.o > %t-detect.txt diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test index 34c22dca3aa18..7de64a6edfe2e 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test @@ -223,6 +223,15 @@ # RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 -DFLAG_VALUE=0x4C +# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + +# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + +# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 -DFLAG_VALUE=0x33 diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 1012cd020d525..bb8ec41d87454 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1619,6 +1619,7 @@ const EnumEntry ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \ + ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011"), \ 
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012"), \ diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 96cb79b7d071c..c76ad018ab4fe 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -43,7 +43,7 @@ set(include_directory ${devicertl_base_directory}/include) set(source_directory ${devicertl_base_directory}/src) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010" + "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010" "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035" "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150" "gfx1151;gfx1152;gfx1153") From cab732861c4885b714c70f2945de9f1dd4d725fa Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 10:44:55 -0800 Subject: [PATCH 018/366] AMDGPU: Add subtarget features for minimum3/maximum3 instructions (#116308) gfx12 and gfx950 managed to produce 3 different permutations of this feature. gfx12 supports f32 and f16, and gfx950 supports f32 and v2f16. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 22 ++++++++++++++++++++++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 11 ++++++++++- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 ++-- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d028c1f5ca761..35dbf86b7c6f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -137,6 +137,18 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32", + "HasMinimum3Maximum3F32", + "true", + "Has v_minimum3_f32 and v_maximum3_f32 instructions" +>; + +def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", + "HasMinimum3Maximum3F16", + "true", + "Has v_minimum3_f16 and v_maximum3_f16 instructions" +>; + def FeatureSupportsXNACK : SubtargetFeature<"xnack-support", "SupportsXNACK", "true", @@ -1263,6 +1275,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureUnalignedDSAccess, FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, + FeatureMinimum3Maximum3F32, FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] >; @@ -2005,6 +2018,15 @@ def isGFX12Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, AssemblerPredicate<(all_of FeatureGFX12Insts)>; +def HasMinimum3Maximum3F32 : + Predicate<"Subtarget->hasMinimum3Maximum3F32()">, + AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>; + +def HasMinimum3Maximum3F16 : + Predicate<"Subtarget->hasMinimum3Maximum3F16()">, + AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; + + def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 1b06756a8a101..2e7a06a15bd52 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -242,7 +242,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; - + bool HasMinimum3Maximum3F32 = false; + bool HasMinimum3Maximum3F16 = false; bool RequiresCOV6 = false; 
// Dummy feature to use for assembler in tablegen. @@ -1307,6 +1308,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the target has instructions with xf32 format support. bool hasXF32Insts() const { return HasXF32Insts; } + bool hasMinimum3Maximum3F32() const { + return HasMinimum3Maximum3F32; + } + + bool hasMinimum3Maximum3F16() const { + return HasMinimum3Maximum3F16; + } + /// \returns The maximum number of instructions that can be enclosed in an /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that /// instruction. diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 34ecdb56e8689..551e8b3a67920 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -226,7 +226,7 @@ let mayRaiseFPException = 0 in { defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile, AMDGPUfmed3>; } // End mayRaiseFPException = 0 -let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { +let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in { defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile, AMDGPUfminimum3>; defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 @@ -625,7 +625,7 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile, AMDGPUsmax3>; defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile, AMDGPUumax3>; -let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { +let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile, AMDGPUfminimum3>; defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 From 5a556d55fb753d7e6e7a310a3fc0f7e83f8f9144 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 10:48:56 -0800 Subject: [PATCH 019/366] AMDGPU: Increase the LDS size to support to 160 KB for gfx950 (#116309) --- llvm/docs/AMDGPUUsage.rst | 2 + llvm/lib/Target/AMDGPU/AMDGPU.td | 17 ++++-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 12 +++-- llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 1 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 + llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 7 +++ .../CodeGen/AMDGPU/lds-limit-diagnostics.ll | 32 ++++++++++++ .../CodeGen/AMDGPU/lds-size-hsa-gfx950.ll | 31 +++++++++++ .../CodeGen/AMDGPU/lds-size-pal-gfx950.ll | 26 ++++++++++ .../tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s | 52 +++++++++++++++++++ 10 files changed, 173 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll create mode 100644 llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index b85b680b9c82d..a25b6feddbedd 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in roundup(lds-size / (64 * 4)) GFX7-GFX11 roundup(lds-size / (128 * 4)) + GFX950 + roundup(lds-size / (320 * 4)) 24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution _INVALID_OPERATION with specified exceptions diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 35dbf86b7c6f3..e84fdf54866cd 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1192,7 +1192,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", "gfx9", - [FeatureFP64, FeatureAddressableLocalMemorySize65536, + [FeatureFP64, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, @@ -1358,6 +1358,7 @@ def FeatureISAVersion8_1_0 : FeatureSet< def FeatureISAVersion9_0_Common : FeatureSet< [FeatureGFX9, + FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureImageInsts, FeatureMadMacF32Insts]>; @@ -1375,7 +1376,8 @@ def FeatureISAVersion9_Generic : FeatureSet< def FeatureISAVersion9_0_MI_Common : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureFmaMixInsts, + [FeatureAddressableLocalMemorySize65536, + FeatureFmaMixInsts, FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, @@ -1491,15 +1493,17 @@ def FeatureISAVersion9_4_Common : FeatureSet< def FeatureISAVersion9_5_Common : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, - [FeatureFP8Insts, + [FeatureAddressableLocalMemorySize163840, + FeatureFP8Insts, FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, - FeatureGFX950Insts + FeatureGFX950Insts, ])>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ + FeatureAddressableLocalMemorySize65536, FeatureForceStoreSC0SC1, FeatureFP8Insts, FeatureFP8ConversionInsts, @@ -1510,6 +1514,7 @@ def FeatureISAVersion9_4_0 : FeatureSet< def FeatureISAVersion9_4_1 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ + FeatureAddressableLocalMemorySize65536, FeatureForceStoreSC0SC1, FeatureFP8Insts, FeatureFP8ConversionInsts, @@ -1520,6 +1525,7 @@ def FeatureISAVersion9_4_1 : FeatureSet< def FeatureISAVersion9_4_2 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ + FeatureAddressableLocalMemorySize65536, FeatureFP8Insts, FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, @@ -1528,7 +1534,8 @@ def FeatureISAVersion9_4_2 : FeatureSet< def FeatureISAVersion9_4_Generic : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, - [FeatureRequiresCOV6])>; + [FeatureAddressableLocalMemorySize65536, + FeatureRequiresCOV6])>; def FeatureISAVersion9_5_0 : FeatureSet; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index d801f2b159127..90ece275412c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks. - LDSAlignShift = 8; - } else { + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + // LDS is allocated in 320 dword blocks. + LDSAlignShift = 11; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize65536)) { // LDS is allocated in 128 dword blocks. LDSAlignShift = 9; + } else { + // LDS is allocated in 64 dword blocks. 
+ LDSAlignShift = 8; } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index f832a2a55d622..74d1faeb6f545 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize : SubtargetFeature< def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; +def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>; class SubtargetFeatureWavefrontSize : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 01866fbd9da6e..501d00b1f308d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { return 32768; if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) return 65536; + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) + return 163840; return 0; } diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll index 13640b74a7937..318ecd16a2ccb 100644 --- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll @@ -2,6 +2,8 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s @@ -17,6 +19,11 @@ ; GFX11-MESA: .long 45100 ; GFX11-MESA-NEXT: .long 1024 +; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200 + +; GFX950-MESA: .long 45100 +; GFX950-MESA-NEXT: .long 512 + ; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400 ; GFX1200-MESA: .long 45100 diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll new file mode 100644 index 0000000000000..73f6dcb3a2a1d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll @@ -0,0 +1,32 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT160K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck 
-check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s +; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT32K %s + +; gfx950 supports upto 160 KB LDS memory. The generic target does not. +; This is a negative test to check when the LDS size exceeds the max usable limit. + +; ERROR-LIMIT160K: error: :0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit' +; ERROR-LIMIT64K: error: :0:0: local memory (163844) exceeds limit (65536) in function 'test_lds_limit' +; ERROR-LIMIT32K: error: :0:0: local memory (163844) exceeds limit (32768) in function 'test_lds_limit' +@dst = addrspace(3) global [40961 x i32] poison + +define amdgpu_kernel void @test_lds_limit(i32 %val) { + %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100 + store i32 %val, ptr addrspace(3) %gep + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll new file mode 100644 index 0000000000000..6ebfc9a5e9d4f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s + +; gfx950 supports upto 160 KB configurable LDS memory. +; This test checks the max and above the old i.e. 128 KiB size of LDS that can be allocated. 
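+; (Illustrative note, not in the original test: the LDSAlignShift = 11 path added in
+; AMDGPUAsmPrinter.cpp above corresponds to a 2048-byte allocation granule, so
+; 131076 bytes rounds up to 65 granules and 163840 bytes to exactly 80, which is
+; what the MESA granulated_lds_size checks below expect.)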
+ +@lds.i32 = addrspace(3) global i32 poison +@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison +@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison + +; GCN-LABEL: test_lds_array_size_131076: +; GCN: .amdhsa_group_segment_fixed_size 131076 +; GCN: ; LDSByteSize: 131076 bytes/workgroup +; MESA: granulated_lds_size = 65 +define amdgpu_kernel void @test_lds_array_size_131076() { + %gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20 + %val = load i32, ptr addrspace(3) %gep + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} + +; GCN-LABEL: test_lds_array_size_163840: +; GCN: .amdhsa_group_segment_fixed_size 163840 +; GCN: ; LDSByteSize: 163840 bytes/workgroup +; MESA: granulated_lds_size = 80 +define amdgpu_kernel void @test_lds_array_size_163840() { + %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20 + %val = load i32, ptr addrspace(3) %gep + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll new file mode 100644 index 0000000000000..22cad8ab5f536 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s + +; GFX950supports upto 160 KB configurable LDS memory. +; This test checks the min and max size of LDS that can be allocated. + +; PAL: .shader_functions: +; PAL: test_lds_array_i32: +; PAL: .lds_size: 0x28000 +; PAL: test_lds_i32: +; PAL: .lds_size: 0x4 + + +@lds.i32 = addrspace(3) global i32 poison +@lds.array.i32 = addrspace(3) global [40959 x i32] poison + +define amdgpu_gfx void @test_lds_i32(i32 %val) { + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} + +define amdgpu_gfx void @test_lds_array_i32() { + %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20 + %val = load i32, ptr addrspace(3) %gep + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} \ No newline at end of file diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s new file mode 100644 index 0000000000000..5b9d42c7fad55 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s @@ -0,0 +1,52 @@ +;; Test disassembly for gfx950 kernel descriptor. 
+ +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o +; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s +; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o +; FIxMe: cmp 1.o 1-disasm.o +; CHECK: .amdhsa_kernel kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 163840 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: .amdhsa_accum_offset 4 +; CHECK-NEXT: .amdhsa_tg_split 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr 8 +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr 8 +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_dx10_clamp 1 +; CHECK-NEXT: .amdhsa_ieee_mode 1 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: .amdhsa_enable_private_segment 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 +; CHECK-NEXT:.end_amdhsa_kernel +.amdhsa_kernel kernel + .amdhsa_group_segment_fixed_size 163840 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel From ca1b35a6c80d7075f4058c642d8c015e4fc8d304 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 10:54:54 -0800 Subject: [PATCH 020/366] AMDGPU: Add v_prng_b32 instruction for gfx950 (#116310) Rand num instruction for stochastic rounding. 
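As a minimal sketch (not part of this patch), the builtin exposed below could feed a
stochastic-rounding helper in OpenCL C; the function name, seeding scheme, and bf16
truncation are illustrative assumptions, and NaN/Inf handling is ignored:

    // Illustrative only: stochastically round an f32 to bf16 by adding
    // pseudo-random bits below the truncation point, then keeping the top 16 bits.
    ushort stochastic_round_f32_to_bf16(float x, uint seed) {
      uint bits = as_uint(x);                    // reinterpret the f32 payload
      uint r = __builtin_amdgcn_prng_b32(seed);  // builtin added by this patch
      return (ushort)((bits + (r & 0xffffu)) >> 16);
    }

Here uint, ushort, and as_uint are OpenCL C built-ins; only __builtin_amdgcn_prng_b32
comes from this change.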
--- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +- .../builtins-amdgcn-gfx950-err.cl | 16 ++++++ .../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 21 +++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 ++ llvm/lib/Target/AMDGPU/AMDGPU.td | 10 ++++ .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 6 ++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 ++ llvm/lib/TargetParser/TargetParser.cpp | 1 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll | 32 +++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_vop1.s | 57 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s | 31 ++++++++++ .../Disassembler/AMDGPU/gfx950_dasm_vop1.txt | 43 ++++++++++++++ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 18 ++++++ 16 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_vop1.s create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8f44afa405938..61516eb2a4a72 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -522,5 +522,7 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 5c324032b5195..61cbf5e65d0d2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -89,7 +89,7 @@ // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl new file mode 100644 index 0000000000000..86f4f73c81c0f --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -emit-llvm \ +// RUN: -verify -o - %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \ +// RUN: -verify -o - %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \ +// RUN: -verify -o - %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \ +// RUN: -verify -o - %s + + +// REQUIRES: amdgpu-registered-target + +typedef unsigned int uint; +void test_prng_b32(global uint* out, uint a) { + *out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}} +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl new file mode 100644 index 0000000000000..f31ba85a52a7a --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -0,0 +1,21 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int uint; + +// CHECK-LABEL: 
@test_prng_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_prng_b32(global uint* out, uint a) { + *out = __builtin_amdgcn_prng_b32(a); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 4829453ee57cd..ed73f0a69e613 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -594,6 +594,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; +def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic< + [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem] +>, ClangBuiltin<"__builtin_amdgcn_prng_b32">; + } // TargetPrefix = "amdgcn" // New-style image intrinsics diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e84fdf54866cd..09f8dde07b740 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -978,6 +978,12 @@ def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order", "VMEM instructions of the same type write VGPR results in order" >; +def FeaturePrngInst : SubtargetFeature<"prng-inst", + "HasPrngInst", + "true", + "Has v_prng_b32 instruction" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1498,6 +1504,7 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, FeatureGFX950Insts, + FeaturePrngInst ])>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -2350,6 +2357,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; +def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">, + AssemblerPredicate<(all_of FeaturePrngInst)>; + def HasGDS : Predicate<"Subtarget->hasGDS()">; def HasGWS : Predicate<"Subtarget->hasGWS()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 8beb9defee66a..28d215e7b3de9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1253,6 +1253,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_prng_b32: { + auto *Src = II.getArgOperand(0); + if (isa(Src)) { + return IC.replaceInstUsesWith(II, Src); + } + } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 415c068367074..03e57db9c11ce 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4515,6 +4515,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_u8_f32: case Intrinsic::amdgcn_alignbyte: case Intrinsic::amdgcn_perm: + case Intrinsic::amdgcn_prng_b32: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_sdot2: case Intrinsic::amdgcn_udot2: diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2e7a06a15bd52..e722e046092fd 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -220,7 +220,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasSALUFloatInsts = false; bool HasPseudoScalarTrans = false; bool HasRestrictedSOffset = false; - + bool HasPrngInst = false; bool HasVcmpxPermlaneHazard = false; bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; @@ -1321,6 +1321,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// instruction. unsigned maxHardClauseLength() const { return MaxHardClauseLength; } + bool hasPrngInst() const { return HasPrngInst; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index f7a66a0820939..e99f562688926 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -761,6 +761,9 @@ let SubtargetPredicate = isGFX11Plus in { defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; } // End SubtargetPredicate = isGFX11Plus +let SubtargetPredicate = HasPrngInst in +defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>; + foreach vt = Reg32Types.types in { def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)), (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0))) @@ -1516,6 +1519,8 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; +defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>; + class MovDPP8Pattern : GCNPat < (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> { diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index b0385915f3042..b236e26f495df 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -470,6 +470,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gws"] = true; break; case GK_GFX950: + Features["prng-inst"] = true; Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll new file mode 100644 index 0000000000000..eeef4eeb65a69 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll @@ -0,0 +1,32 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.amdgcn.prng.b32(i32) #0 + +; GCN-LABEL: {{^}}prng_b32: +; GCN: v_prng_b32_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @prng_b32(ptr addrspace(1) %out, i32 %src) #1 { + %prng = call 
i32 @llvm.amdgcn.prng.b32(i32 %src) #0 + store i32 %prng, ptr addrspace(1) %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}prng_b32_constant_4 +; GCN: v_prng_b32_e32 {{v[0-9]+}}, 4 +define amdgpu_kernel void @prng_b32_constant_4(ptr addrspace(1) %out) #1 { + %prng = call i32 @llvm.amdgcn.prng.b32(i32 4) #0 + store i32 %prng, ptr addrspace(1) %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}prng_b32_constant_100 +; GCN: v_prng_b32_e32 {{v[0-9]+}}, 0x64 +define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 { + %prng = call i32 @llvm.amdgcn.prng.b32(i32 100) #0 + store i32 %prng, ptr addrspace(1) %out, align 4 + ret void +} + + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } \ No newline at end of file diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s new file mode 100644 index 0000000000000..0cb292ffe63dd --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s + +v_prng_b32 v5, v1 +// GFX950: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, v255 +// GFX950: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0xb1,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, s1 +// GFX950: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, s101 +// GFX950: v_prng_b32_e32 v5, s101 ; encoding: [0x65,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, vcc_lo +// GFX950: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, vcc_hi +// GFX950: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, ttmp15 +// GFX950: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, m0 +// GFX950: v_prng_b32_e32 v5, m0 ; encoding: [0x7c,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, exec_lo +// GFX950: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, exec_hi +// GFX950: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, -1 +// GFX950: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, 0.5 +// GFX950: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v5, src_scc +// GFX950: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_prng_b32 v255, 0xaf123456 +// GFX950: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf] +// 
GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s new file mode 100644 index 0000000000000..301750689bc78 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s @@ -0,0 +1,31 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefixes=GFX950 %s + +v_prng_b32 v5, v1 quad_perm:[3,2,1,0] +// GFX950: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +v_prng_b32 v5, v1 quad_perm:[0,1,2,3] +// GFX950: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +v_prng_b32 v5, v1 row_mirror +// GFX950: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff] + +v_prng_b32 v5, v1 row_half_mirror +// GFX950: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff] + +v_prng_b32 v5, v1 row_shl:1 +// GFX950: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff] + +v_prng_b32 v5, v1 row_shl:15 +// GFX950: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +v_prng_b32 v5, v1 row_shr:1 +// GFX950: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff] + +v_prng_b32 v5, v1 row_shr:15 +// GFX950: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +v_prng_b32 v5, v1 row_ror:1 +// GFX950: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff] + +v_prng_b32 v5, v1 row_ror:15 +// GFX950: v_prng_b32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt new file mode 100644 index 0000000000000..91ab05e99f1e7 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt @@ -0,0 +1,43 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s + +# GFX950: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e] +0x01,0xb1,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0xb1,0x0a,0x7e] +0xff,0xb1,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0xb0,0x0a,0x7e] +0x01,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, s101 ; encoding: [0x65,0xb0,0x0a,0x7e] +0x65,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0xb0,0x0a,0x7e] +0x6a,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0xb0,0x0a,0x7e] +0x6b,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0xb0,0x0a,0x7e] +0x7b,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, m0 ; encoding: [0x7c,0xb0,0x0a,0x7e] +0x7c,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0xb0,0x0a,0x7e] +0x7e,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0xb0,0x0a,0x7e] +0x7f,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] +0xc1,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] +0xf0,0xb0,0x0a,0x7e + +# GFX950: 
v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] +0xfd,0xb0,0x0a,0x7e + +# GFX950: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf] +0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf \ No newline at end of file diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 779def76fc58d..5fdb918c87545 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -6547,3 +6547,21 @@ define half @test_constant_fold_exp2_f16_neg_denorm() { %val = call half @llvm.amdgcn.exp2.f16(half 0xH83ff) ret half %val } + +; -------------------------------------------------------------------- +; llvm.amdgcn.prng +; -------------------------------------------------------------------- +declare i32 @llvm.amdgcn.prng.b32(i32) +define i32 @prng_undef_i32() { +; CHECK-LABEL: @prng_undef_i32( +; CHECK-NEXT: ret i32 undef + %prng = call i32 @llvm.amdgcn.prng.b32(i32 undef) + ret i32 %prng +} + +define i32 @prng_poison_i32() { +; CHECK-LABEL: @prng_poison_i32( +; CHECK-NEXT: ret i32 poison + %prng = call i32 @llvm.amdgcn.prng.b32(i32 poison) + ret i32 %prng +} From de5e4ebb5a1b82df5b1d27f423dbad30f872aac6 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 18 Nov 2024 19:59:47 +0100 Subject: [PATCH 021/366] [libc++] Remove transitive includes from empty headers (#116295) This removes transitive includes that are only in a header that is empty in a given C++ version. --- libcxx/include/bit | 4 - libcxx/include/charconv | 7 -- libcxx/include/compare | 6 - libcxx/include/expected | 6 - libcxx/include/mdspan | 9 -- libcxx/include/memory_resource | 9 -- libcxx/include/ranges | 8 -- .../test/libcxx/transitive_includes/cxx03.csv | 107 ----------------- .../test/libcxx/transitive_includes/cxx11.csv | 107 ----------------- .../test/libcxx/transitive_includes/cxx14.csv | 110 ------------------ .../test/libcxx/transitive_includes/cxx17.csv | 77 ------------ .../test/libcxx/transitive_includes/cxx20.csv | 53 --------- 12 files changed, 503 deletions(-) diff --git a/libcxx/include/bit b/libcxx/include/bit index 94387d101a398..092aebca26a31 100644 --- a/libcxx/include/bit +++ b/libcxx/include/bit @@ -87,10 +87,6 @@ namespace std { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -#endif - #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/charconv b/libcxx/include/charconv index 8f5e697eec439..a65b3d3527080 100644 --- a/libcxx/include/charconv +++ b/libcxx/include/charconv @@ -101,13 +101,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include -# include -# include -#endif - #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/compare b/libcxx/include/compare index de0e4c7ec2280..440d4c4b4dd26 100644 --- a/libcxx/include/compare +++ b/libcxx/include/compare @@ -164,12 +164,6 @@ namespace std { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -# include -# include -#endif - #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/expected b/libcxx/include/expected index 
6a2f12f2bf3b5..3c7ef336432a1 100644 --- a/libcxx/include/expected +++ b/libcxx/include/expected @@ -53,10 +53,4 @@ namespace std { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -#endif - #endif // _LIBCPP_EXPECTED diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan index 29190e4a9953e..d6191a197e15c 100644 --- a/libcxx/include/mdspan +++ b/libcxx/include/mdspan @@ -426,13 +426,4 @@ namespace std { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -#endif - #endif // _LIBCPP_MDSPAN diff --git a/libcxx/include/memory_resource b/libcxx/include/memory_resource index e98ca20aa058c..7de69e67b7c06 100644 --- a/libcxx/include/memory_resource +++ b/libcxx/include/memory_resource @@ -66,15 +66,6 @@ namespace std::pmr { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include -# include -# include -# include -# include -#endif - #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include #endif diff --git a/libcxx/include/ranges b/libcxx/include/ranges index b17a399e0ed65..d8ee6f75e8b23 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -446,14 +446,6 @@ namespace std { # pragma GCC system_header #endif -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -# include -# include -# include -# include -#endif - #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index d70541290023b..72fccfd364932 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -242,17 +242,14 @@ ccomplex utility ccomplex variant ccomplex vector ccomplex version -charconv cerrno charconv cmath charconv concepts charconv cstddef charconv cstdint charconv cstdlib charconv cstring -charconv initializer_list charconv iosfwd charconv limits -charconv new charconv type_traits charconv version chrono algorithm @@ -571,12 +568,6 @@ exception typeinfo exception version execution cstddef execution version -expected cstddef -expected cstdint -expected cstdlib -expected initializer_list -expected new -expected type_traits expected version experimental/iterator algorithm experimental/iterator atomic @@ -1340,53 +1331,6 @@ map utility map variant map vector map version -mdspan algorithm -mdspan array -mdspan atomic -mdspan bit -mdspan cctype -mdspan cerrno -mdspan cinttypes -mdspan climits -mdspan clocale -mdspan cmath -mdspan compare -mdspan concepts -mdspan cstdarg -mdspan cstddef -mdspan cstdint -mdspan cstdio -mdspan cstdlib -mdspan cstring -mdspan ctime -mdspan cwchar -mdspan cwctype -mdspan exception -mdspan functional -mdspan initializer_list -mdspan ios -mdspan iosfwd -mdspan iterator -mdspan limits -mdspan locale -mdspan memory -mdspan mutex -mdspan new -mdspan optional -mdspan ratio -mdspan span -mdspan stdexcept -mdspan streambuf -mdspan string -mdspan string_view -mdspan system_error -mdspan tuple -mdspan type_traits -mdspan typeinfo -mdspan unordered_map -mdspan utility -mdspan variant -mdspan vector mdspan version memory atomic memory cctype @@ -1416,42 +1360,15 @@ memory typeinfo memory utility memory variant memory version -memory_resource 
algorithm -memory_resource atomic -memory_resource bit -memory_resource cctype -memory_resource cerrno -memory_resource climits -memory_resource cmath -memory_resource compare -memory_resource concepts memory_resource cstddef memory_resource cstdint -memory_resource cstdio memory_resource cstdlib -memory_resource cstring -memory_resource ctime -memory_resource cwchar -memory_resource cwctype memory_resource exception -memory_resource initializer_list memory_resource iosfwd -memory_resource iterator -memory_resource limits -memory_resource memory -memory_resource mutex memory_resource new -memory_resource optional -memory_resource ratio memory_resource stdexcept -memory_resource string -memory_resource string_view -memory_resource system_error -memory_resource tuple memory_resource type_traits memory_resource typeinfo -memory_resource utility -memory_resource variant memory_resource version mutex algorithm mutex atomic @@ -1772,52 +1689,28 @@ random utility random variant random vector random version -ranges algorithm -ranges array -ranges atomic -ranges bit ranges cctype -ranges cerrno -ranges climits -ranges clocale ranges cmath ranges compare ranges concepts -ranges cstdarg ranges cstddef ranges cstdint ranges cstdio ranges cstdlib ranges cstring -ranges ctime ranges cwchar ranges cwctype ranges exception -ranges functional ranges initializer_list -ranges ios ranges iosfwd ranges iterator ranges limits -ranges locale -ranges memory -ranges mutex ranges new -ranges optional -ranges ratio -ranges span -ranges stdexcept -ranges streambuf -ranges string -ranges string_view -ranges system_error ranges tuple ranges type_traits ranges typeinfo -ranges unordered_map ranges utility ranges variant -ranges vector ranges version ratio climits ratio cstdint diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index d70541290023b..72fccfd364932 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -242,17 +242,14 @@ ccomplex utility ccomplex variant ccomplex vector ccomplex version -charconv cerrno charconv cmath charconv concepts charconv cstddef charconv cstdint charconv cstdlib charconv cstring -charconv initializer_list charconv iosfwd charconv limits -charconv new charconv type_traits charconv version chrono algorithm @@ -571,12 +568,6 @@ exception typeinfo exception version execution cstddef execution version -expected cstddef -expected cstdint -expected cstdlib -expected initializer_list -expected new -expected type_traits expected version experimental/iterator algorithm experimental/iterator atomic @@ -1340,53 +1331,6 @@ map utility map variant map vector map version -mdspan algorithm -mdspan array -mdspan atomic -mdspan bit -mdspan cctype -mdspan cerrno -mdspan cinttypes -mdspan climits -mdspan clocale -mdspan cmath -mdspan compare -mdspan concepts -mdspan cstdarg -mdspan cstddef -mdspan cstdint -mdspan cstdio -mdspan cstdlib -mdspan cstring -mdspan ctime -mdspan cwchar -mdspan cwctype -mdspan exception -mdspan functional -mdspan initializer_list -mdspan ios -mdspan iosfwd -mdspan iterator -mdspan limits -mdspan locale -mdspan memory -mdspan mutex -mdspan new -mdspan optional -mdspan ratio -mdspan span -mdspan stdexcept -mdspan streambuf -mdspan string -mdspan string_view -mdspan system_error -mdspan tuple -mdspan type_traits -mdspan typeinfo -mdspan unordered_map -mdspan utility -mdspan variant -mdspan vector mdspan version memory atomic memory cctype @@ -1416,42 
+1360,15 @@ memory typeinfo memory utility memory variant memory version -memory_resource algorithm -memory_resource atomic -memory_resource bit -memory_resource cctype -memory_resource cerrno -memory_resource climits -memory_resource cmath -memory_resource compare -memory_resource concepts memory_resource cstddef memory_resource cstdint -memory_resource cstdio memory_resource cstdlib -memory_resource cstring -memory_resource ctime -memory_resource cwchar -memory_resource cwctype memory_resource exception -memory_resource initializer_list memory_resource iosfwd -memory_resource iterator -memory_resource limits -memory_resource memory -memory_resource mutex memory_resource new -memory_resource optional -memory_resource ratio memory_resource stdexcept -memory_resource string -memory_resource string_view -memory_resource system_error -memory_resource tuple memory_resource type_traits memory_resource typeinfo -memory_resource utility -memory_resource variant memory_resource version mutex algorithm mutex atomic @@ -1772,52 +1689,28 @@ random utility random variant random vector random version -ranges algorithm -ranges array -ranges atomic -ranges bit ranges cctype -ranges cerrno -ranges climits -ranges clocale ranges cmath ranges compare ranges concepts -ranges cstdarg ranges cstddef ranges cstdint ranges cstdio ranges cstdlib ranges cstring -ranges ctime ranges cwchar ranges cwctype ranges exception -ranges functional ranges initializer_list -ranges ios ranges iosfwd ranges iterator ranges limits -ranges locale -ranges memory -ranges mutex ranges new -ranges optional -ranges ratio -ranges span -ranges stdexcept -ranges streambuf -ranges string -ranges string_view -ranges system_error ranges tuple ranges type_traits ranges typeinfo -ranges unordered_map ranges utility ranges variant -ranges vector ranges version ratio climits ratio cstdint diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 90bff887eb278..fd36dace19c76 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -247,17 +247,14 @@ ccomplex utility ccomplex variant ccomplex vector ccomplex version -charconv cerrno charconv cmath charconv concepts charconv cstddef charconv cstdint charconv cstdlib charconv cstring -charconv initializer_list charconv iosfwd charconv limits -charconv new charconv type_traits charconv version chrono algorithm @@ -582,12 +579,6 @@ exception typeinfo exception version execution cstddef execution version -expected cstddef -expected cstdint -expected cstdlib -expected initializer_list -expected new -expected type_traits expected version experimental/iterator algorithm experimental/iterator atomic @@ -1370,54 +1361,6 @@ map utility map variant map vector map version -mdspan algorithm -mdspan array -mdspan atomic -mdspan bit -mdspan cctype -mdspan cerrno -mdspan cinttypes -mdspan climits -mdspan clocale -mdspan cmath -mdspan compare -mdspan concepts -mdspan cstdarg -mdspan cstddef -mdspan cstdint -mdspan cstdio -mdspan cstdlib -mdspan cstring -mdspan ctime -mdspan cwchar -mdspan cwctype -mdspan exception -mdspan execution -mdspan functional -mdspan initializer_list -mdspan ios -mdspan iosfwd -mdspan iterator -mdspan limits -mdspan locale -mdspan memory -mdspan mutex -mdspan new -mdspan optional -mdspan ratio -mdspan span -mdspan stdexcept -mdspan streambuf -mdspan string -mdspan string_view -mdspan system_error -mdspan tuple -mdspan type_traits -mdspan typeinfo -mdspan 
unordered_map -mdspan utility -mdspan variant -mdspan vector mdspan version memory atomic memory cctype @@ -1447,43 +1390,15 @@ memory typeinfo memory utility memory variant memory version -memory_resource algorithm -memory_resource atomic -memory_resource bit -memory_resource cctype -memory_resource cerrno -memory_resource climits -memory_resource cmath -memory_resource compare -memory_resource concepts memory_resource cstddef memory_resource cstdint -memory_resource cstdio memory_resource cstdlib -memory_resource cstring -memory_resource ctime -memory_resource cwchar -memory_resource cwctype memory_resource exception -memory_resource execution -memory_resource initializer_list memory_resource iosfwd -memory_resource iterator -memory_resource limits -memory_resource memory -memory_resource mutex memory_resource new -memory_resource optional -memory_resource ratio memory_resource stdexcept -memory_resource string -memory_resource string_view -memory_resource system_error -memory_resource tuple memory_resource type_traits memory_resource typeinfo -memory_resource utility -memory_resource variant memory_resource version mutex algorithm mutex atomic @@ -1808,53 +1723,28 @@ random utility random variant random vector random version -ranges algorithm -ranges array -ranges atomic -ranges bit ranges cctype -ranges cerrno -ranges climits -ranges clocale ranges cmath ranges compare ranges concepts -ranges cstdarg ranges cstddef ranges cstdint ranges cstdio ranges cstdlib ranges cstring -ranges ctime ranges cwchar ranges cwctype ranges exception -ranges execution -ranges functional ranges initializer_list -ranges ios ranges iosfwd ranges iterator ranges limits -ranges locale -ranges memory -ranges mutex ranges new -ranges optional -ranges ratio -ranges span -ranges stdexcept -ranges streambuf -ranges string -ranges string_view -ranges system_error ranges tuple ranges type_traits ranges typeinfo -ranges unordered_map ranges utility ranges variant -ranges vector ranges version ratio climits ratio cstdint diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 2f908e7f78ec1..eaec25f81e582 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -571,12 +571,6 @@ exception typeinfo exception version execution cstddef execution version -expected cstddef -expected cstdint -expected cstdlib -expected initializer_list -expected new -expected type_traits expected version experimental/iterator algorithm experimental/iterator atomic @@ -1364,53 +1358,6 @@ map utility map variant map vector map version -mdspan algorithm -mdspan array -mdspan atomic -mdspan bit -mdspan cctype -mdspan cerrno -mdspan cinttypes -mdspan climits -mdspan clocale -mdspan cmath -mdspan compare -mdspan concepts -mdspan cstdarg -mdspan cstddef -mdspan cstdint -mdspan cstdio -mdspan cstdlib -mdspan cstring -mdspan ctime -mdspan cwchar -mdspan cwctype -mdspan exception -mdspan functional -mdspan initializer_list -mdspan ios -mdspan iosfwd -mdspan iterator -mdspan limits -mdspan locale -mdspan memory -mdspan mutex -mdspan new -mdspan optional -mdspan ratio -mdspan span -mdspan stdexcept -mdspan streambuf -mdspan string -mdspan string_view -mdspan system_error -mdspan tuple -mdspan type_traits -mdspan typeinfo -mdspan unordered_map -mdspan utility -mdspan variant -mdspan vector mdspan version memory atomic memory cctype @@ -1796,52 +1743,28 @@ random utility random variant random vector random version -ranges algorithm 
-ranges array -ranges atomic -ranges bit ranges cctype -ranges cerrno -ranges climits -ranges clocale ranges cmath ranges compare ranges concepts -ranges cstdarg ranges cstddef ranges cstdint ranges cstdio ranges cstdlib ranges cstring -ranges ctime ranges cwchar ranges cwctype ranges exception -ranges functional ranges initializer_list -ranges ios ranges iosfwd ranges iterator ranges limits -ranges locale -ranges memory -ranges mutex ranges new -ranges optional -ranges ratio -ranges span -ranges stdexcept -ranges streambuf -ranges string -ranges string_view -ranges system_error ranges tuple ranges type_traits ranges typeinfo -ranges unordered_map ranges utility ranges variant -ranges vector ranges version ratio climits ratio cstdint diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 1a198aa4562fd..89c28e49d6c9d 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -567,12 +567,6 @@ exception typeinfo exception version execution cstddef execution version -expected cstddef -expected cstdint -expected cstdlib -expected initializer_list -expected new -expected type_traits expected version experimental/iterator algorithm experimental/iterator atomic @@ -1358,53 +1352,6 @@ map utility map variant map vector map version -mdspan algorithm -mdspan array -mdspan atomic -mdspan bit -mdspan cctype -mdspan cerrno -mdspan cinttypes -mdspan climits -mdspan clocale -mdspan cmath -mdspan compare -mdspan concepts -mdspan cstdarg -mdspan cstddef -mdspan cstdint -mdspan cstdio -mdspan cstdlib -mdspan cstring -mdspan ctime -mdspan cwchar -mdspan cwctype -mdspan exception -mdspan functional -mdspan initializer_list -mdspan ios -mdspan iosfwd -mdspan iterator -mdspan limits -mdspan locale -mdspan memory -mdspan mutex -mdspan new -mdspan optional -mdspan ratio -mdspan span -mdspan stdexcept -mdspan streambuf -mdspan string -mdspan string_view -mdspan system_error -mdspan tuple -mdspan type_traits -mdspan typeinfo -mdspan unordered_map -mdspan utility -mdspan variant -mdspan vector mdspan version memory atomic memory cctype From 486e1d91e30068381f7ef4157361fe35c15abdee Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Mon, 18 Nov 2024 11:01:44 -0800 Subject: [PATCH 022/366] [RISCV][docs] Release Notes These cover recent additions and changes to assembly and inline assembly support. --- llvm/docs/ReleaseNotes.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 03c758c4e7f4b..e9749e591f95c 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -197,6 +197,20 @@ Changes to the RISC-V Backend * The `Sha` extension is now supported. * The RVA23U64, RVA23S64, RVB23U64, and RVB23S64 profiles are no longer marked as experimental. +* `.insn , ` can be used to assemble 48- and 64-bit + instructions from raw integer values. +* `.insn [,] ` now accepts absolute expressions for both + expressions, so that they can be computed from constants and absolute symbols. +* The following new inline assembly constraints and modifiers are accepted: + * `cr` constraint meaning an RVC-encoding compatible GPR (`x8`-`x15`) + * `cf` constraint meaning an RVC-encoding compatible FPR (`f8`-`f15`) + * `R` constraint meaning an even-odd GPR pair (prints as the even register, + but both registers in the pair are considered live). + * `N` modifer meaning print the register encoding (0-31) rather than the name. 
+* `f` and `cf` inline assembly constraints, when using F-/D-/H-in-X extensions, + will use the relevant GPR rather than FPR. This makes inline assembly portable + between e.g. F and Zfinx code. + Changes to the WebAssembly Backend ---------------------------------- From 85ef9666c892d5e11fce3a0b84e4eaf4603256ee Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 18 Nov 2024 20:04:05 +0100 Subject: [PATCH 023/366] [libc++] Avoid including all of in (#116541) --- libcxx/include/future | 3 ++- libcxx/test/libcxx/transitive_includes/cxx23.csv | 3 --- libcxx/test/libcxx/transitive_includes/cxx26.csv | 3 --- libcxx/test/std/thread/futures/futures.async/async.pass.cpp | 1 + 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/libcxx/include/future b/libcxx/include/future index 9f7c95e542fd6..cbf3ed9346417 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -384,6 +384,7 @@ template struct uses_allocator, Alloc>; # include <__system_error/error_category.h> # include <__system_error/error_code.h> # include <__system_error/error_condition.h> +# include <__thread/thread.h> # include <__type_traits/add_lvalue_reference.h> # include <__type_traits/aligned_storage.h> # include <__type_traits/conditional.h> @@ -397,7 +398,6 @@ template struct uses_allocator, Alloc>; # include # include # include -# include # include # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -2071,6 +2071,7 @@ _LIBCPP_POP_MACROS # include # include # include +# include #endif #endif // _LIBCPP_FUTURE diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 791aad29710b5..a008b4d76edde 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -445,8 +445,6 @@ functional tuple functional typeinfo functional unordered_map functional version -future array -future atomic future bitset future cctype future cerrno @@ -475,7 +473,6 @@ future stdexcept future streambuf future string future string_view -future thread future tuple future typeinfo future version diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 78c457a22c31d..d5321da32b3d4 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -444,8 +444,6 @@ functional tuple functional typeinfo functional unordered_map functional version -future array -future atomic future bitset future cctype future cerrno @@ -474,7 +472,6 @@ future stdexcept future streambuf future string future string_view -future thread future tuple future typeinfo future version diff --git a/libcxx/test/std/thread/futures/futures.async/async.pass.cpp b/libcxx/test/std/thread/futures/futures.async/async.pass.cpp index 7e0d82f0d6589..109372b50a311 100644 --- a/libcxx/test/std/thread/futures/futures.async/async.pass.cpp +++ b/libcxx/test/std/thread/futures/futures.async/async.pass.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include "test_macros.h" From 3b8606be547acbc7ae93d943645e6d6c83f66983 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Mon, 18 Nov 2024 11:07:15 -0800 Subject: [PATCH 024/366] Re-land "[SandboxIR] Add debug checker to compare IR before/after a revert (#115968)" (#116671) This PR re-lands https://github.com/llvm/llvm-project/pull/115968 with a fix for a buildbot failure. The `IRSnapshotChecker` class is only defined in debug mode, so its unit tests must also be inside `#ifndef NDEBUG`. 
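For readers skimming the diff below, a minimal sketch of the intended save/compare flow, distilled from the new unit tests in this patch (the checker only exists in !NDEBUG builds, and the Tracker only invokes it automatically when EXPENSIVE_CHECKS is also enabled):

    // Illustrative sketch only; LLVMCtx/LLVMF stand for an existing
    // llvm::LLVMContext and llvm::Function, the other names come from the patch.
    sandboxir::Context Ctx(LLVMCtx);
    auto *F = Ctx.createFunction(&LLVMF);       // wrap the function in SandboxIR
    sandboxir::IRSnapshotChecker Checker(Ctx);
    Checker.save();                             // hash + textual snapshot of every function
    // ... make SandboxIR changes, then revert them ...
    Checker.expectNoDiff();                     // dumps old/new IR and aborts on mismatch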
--- llvm/include/llvm/SandboxIR/Context.h | 11 ++-- llvm/include/llvm/SandboxIR/Instruction.h | 1 + llvm/include/llvm/SandboxIR/Tracker.h | 66 ++++++++++++++++++-- llvm/lib/SandboxIR/Tracker.cpp | 73 ++++++++++++++++++++++- llvm/unittests/SandboxIR/TrackerTest.cpp | 68 +++++++++++++++++++++ 5 files changed, 209 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index f2056de87cb94..b0d6f8335d9e0 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -44,11 +44,12 @@ class Context { protected: LLVMContext &LLVMCtx; - friend class Type; // For LLVMCtx. - friend class PointerType; // For LLVMCtx. - friend class IntegerType; // For LLVMCtx. - friend class StructType; // For LLVMCtx. - friend class Region; // For LLVMCtx. + friend class Type; // For LLVMCtx. + friend class PointerType; // For LLVMCtx. + friend class IntegerType; // For LLVMCtx. + friend class StructType; // For LLVMCtx. + friend class Region; // For LLVMCtx. + friend class IRSnapshotChecker; // To snapshot LLVMModuleToModuleMap. Tracker IRTracker; diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index d9642365908d2..2a59d72e28552 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -11,6 +11,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Constant.h" diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index dab20eb809ba0..9a031f3270837 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -42,13 +42,12 @@ #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/Module.h" #include "llvm/SandboxIR/Use.h" #include "llvm/Support/Debug.h" #include -#include namespace llvm::sandboxir { @@ -64,9 +63,56 @@ class SwitchInst; class ConstantInt; class ShuffleVectorInst; class CmpInst; -class Module; class GlobalVariable; +#ifndef NDEBUG + +/// A class that saves hashes and textual IR snapshots of functions in a +/// SandboxIR Context, and does hash comparison when `expectNoDiff` is called. +/// If hashes differ, it prints textual IR for both old and new versions to +/// aid debugging. +/// +/// This is used as an additional debug check when reverting changes to +/// SandboxIR, to verify the reverted state matches the initial state. +class IRSnapshotChecker { + Context &Ctx; + + // A snapshot of textual IR for a function, with a hash for quick comparison. + struct FunctionSnapshot { + llvm::stable_hash Hash; + std::string TextualIR; + }; + + // A snapshot for each llvm::Function found in every module in the SandboxIR + // Context. In practice there will always be one module, but sandbox IR + // save/restore ops work at the Context level, so we must take the full state + // into account. + using ContextSnapshot = DenseMap; + + ContextSnapshot OrigContextSnapshot; + + // Dumps to a string the textual IR for a single Function. + std::string dumpIR(const llvm::Function &F) const; + + // Returns a snapshot of all the modules in the sandbox IR context. + ContextSnapshot takeSnapshot() const; + + // Compares two snapshots and returns true if they differ. 
+ bool diff(const ContextSnapshot &Orig, const ContextSnapshot &Curr) const; + +public: + IRSnapshotChecker(Context &Ctx) : Ctx(Ctx) {} + + /// Saves a snapshot of the current state. If there was any previous snapshot, + /// it will be replaced with the new one. + void save(); + + /// Checks current state against saved state, crashes if different. + void expectNoDiff(); +}; + +#endif // NDEBUG + /// The base class for IR Change classes. class IRChangeBase { protected: @@ -405,6 +451,10 @@ class Tracker { TrackerState State = TrackerState::Disabled; Context &Ctx; +#ifndef NDEBUG + IRSnapshotChecker SnapshotChecker; +#endif + public: #ifndef NDEBUG /// Helps catch bugs where we are creating new change objects while in the @@ -412,7 +462,15 @@ class Tracker { bool InMiddleOfCreatingChange = false; #endif // NDEBUG - explicit Tracker(Context &Ctx) : Ctx(Ctx) {} + explicit Tracker(Context &Ctx) + : Ctx(Ctx) +#ifndef NDEBUG + , + SnapshotChecker(Ctx) +#endif + { + } + ~Tracker(); Context &getContext() const { return Ctx; } /// Record \p Change and take ownership. This is the main function used to diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index d35e3ba84990f..27ed37aa9bdd3 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -10,12 +10,75 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/SandboxIR/Instruction.h" #include using namespace llvm::sandboxir; #ifndef NDEBUG + +std::string IRSnapshotChecker::dumpIR(const llvm::Function &F) const { + std::string Result; + raw_string_ostream SS(Result); + F.print(SS, /*AssemblyAnnotationWriter=*/nullptr); + return Result; +} + +IRSnapshotChecker::ContextSnapshot IRSnapshotChecker::takeSnapshot() const { + ContextSnapshot Result; + for (const auto &Entry : Ctx.LLVMModuleToModuleMap) + for (const auto &F : *Entry.first) { + FunctionSnapshot Snapshot; + Snapshot.Hash = StructuralHash(F, /*DetailedHash=*/true); + Snapshot.TextualIR = dumpIR(F); + Result[&F] = Snapshot; + } + return Result; +} + +bool IRSnapshotChecker::diff(const ContextSnapshot &Orig, + const ContextSnapshot &Curr) const { + bool DifferenceFound = false; + for (const auto &[F, OrigFS] : Orig) { + auto CurrFSIt = Curr.find(F); + if (CurrFSIt == Curr.end()) { + DifferenceFound = true; + dbgs() << "Function " << F->getName() << " not found in current IR.\n"; + dbgs() << OrigFS.TextualIR << "\n"; + continue; + } + const FunctionSnapshot &CurrFS = CurrFSIt->second; + if (OrigFS.Hash != CurrFS.Hash) { + DifferenceFound = true; + dbgs() << "Found IR difference in Function " << F->getName() << "\n"; + dbgs() << "Original:\n" << OrigFS.TextualIR << "\n"; + dbgs() << "Current:\n" << CurrFS.TextualIR << "\n"; + } + } + // Check that Curr doesn't contain any new functions. + for (const auto &[F, CurrFS] : Curr) { + if (!Orig.contains(F)) { + DifferenceFound = true; + dbgs() << "Function " << F->getName() + << " found in current IR but not in original snapshot.\n"; + dbgs() << CurrFS.TextualIR << "\n"; + } + } + return DifferenceFound; +} + +void IRSnapshotChecker::save() { OrigContextSnapshot = takeSnapshot(); } + +void IRSnapshotChecker::expectNoDiff() { + ContextSnapshot CurrContextSnapshot = takeSnapshot(); + if (diff(OrigContextSnapshot, CurrContextSnapshot)) { + llvm_unreachable( + "Original and current IR differ! 
Probably a checkpointing bug."); + } +} + void UseSet::dump() const { dump(dbgs()); dbgs() << "\n"; @@ -275,7 +338,12 @@ void CmpSwapOperands::dump() const { } #endif -void Tracker::save() { State = TrackerState::Record; } +void Tracker::save() { + State = TrackerState::Record; +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) + SnapshotChecker.save(); +#endif +} void Tracker::revert() { assert(State == TrackerState::Record && "Forgot to save()!"); @@ -283,6 +351,9 @@ void Tracker::revert() { for (auto &Change : reverse(Changes)) Change->revert(*this); Changes.clear(); +#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) + SnapshotChecker.expectNoDiff(); +#endif } void Tracker::accept() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index 4f2cfa6b06ecd..4eedab124bfa0 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1844,3 +1844,71 @@ define void @foo(i32 %arg, float %farg) { Ctx.revert(); EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); } + +// IRSnapshotChecker is only defined in debug mode. +#ifndef NDEBUG + +TEST_F(TrackerTest, IRSnapshotCheckerNoChanges) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + ret i32 %add0 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + [[maybe_unused]] auto *F = Ctx.createFunction(&LLVMF); + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Checker.expectNoDiff(); +} + +TEST_F(TrackerTest, IRSnapshotCheckerDiesWithUnexpectedChanges) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + %add1 = add i32 %add0, %arg + ret i32 %add1 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::Instruction *Add0 = &*It++; + sandboxir::Instruction *Add1 = &*It++; + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Add1->setOperand(1, Add0); + EXPECT_DEATH(Checker.expectNoDiff(), "Found IR difference"); +} + +TEST_F(TrackerTest, IRSnapshotCheckerSaveMultipleTimes) { + parseIR(C, R"IR( +define i32 @foo(i32 %arg) { + %add0 = add i32 %arg, %arg + %add1 = add i32 %add0, %arg + ret i32 %add1 +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::Instruction *Add0 = &*It++; + sandboxir::Instruction *Add1 = &*It++; + sandboxir::IRSnapshotChecker Checker(Ctx); + Checker.save(); + Add1->setOperand(1, Add0); + // Now IR differs from the last snapshot. Let's take a new snapshot. + Checker.save(); + // The new snapshot should have replaced the old one, so this should succeed. + Checker.expectNoDiff(); +} + +#endif // NDEBUG From f14e1a8597f83fa5bbc78befcb7059144d58ff5c Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Mon, 18 Nov 2024 11:08:18 -0800 Subject: [PATCH 025/366] Revert "Add support for reading the dynamic symbol table from PT_DYNAMIC (#112596)" This reverts commit a7b2e73bcaa91255a20f1f2e692bec9eb6c17022. 
This patch broke the greendragon bot Failed Tests (10): lldb-api :: python_api/sbplatform/TestLocateModuleCallback.py lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleCallbackSuccessWithModuleAndSymbol lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleCallbackSuccessWithOnlySymbol lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleCallbackSuccessWithSymbolAsModule lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleCallbackSuccessWithSymbolAsModuleAndSymbol lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleCallbackSuccessWithSymbolByPlatformUUID lldb-unit :: Target/./TargetTests/LocateModuleCallbackTest/GetOrCreateModuleWithCachedModuleAndSymbol lldb-unit :: Target/./TargetTests/ModuleCacheTest/GetAndPut lldb-unit :: Target/./TargetTests/ModuleCacheTest/GetAndPutStrangeHostname lldb-unit :: Target/./TargetTests/ModuleCacheTest/GetAndPutUuidExists --- .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 182 ++---------------- .../Plugins/ObjectFile/ELF/ObjectFileELF.h | 41 ---- .../test/Shell/ObjectFile/ELF/elf-dynsym.test | 42 ---- 3 files changed, 21 insertions(+), 244 deletions(-) delete mode 100644 lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 8df226817326d..9c7dff8127f47 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -44,7 +44,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/MipsABIFlags.h" -#include "lldb/Target/Process.h" #define CASE_AND_STREAM(s, def, width) \ case def: \ @@ -3008,10 +3007,9 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) { // section, nomatter if .symtab was already parsed or not. This is because // minidebuginfo normally removes the .symtab symbols which have their // matching .dynsym counterparts. - Section *dynsym = nullptr; if (!symtab || GetSectionList()->FindSectionByName(ConstString(".gnu_debugdata"))) { - dynsym = + Section *dynsym = section_list->FindSectionByType(eSectionTypeELFDynamicSymbols, true) .get(); if (dynsym) { @@ -3021,20 +3019,6 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) { m_address_class_map.merge(address_class_map); } } - if (!dynsym) { - // Try and read the dynamic symbol table from the .dynamic section. - uint32_t num_symbols = 0; - std::optional symtab_data = - GetDynsymDataFromDynamic(num_symbols); - std::optional strtab_data = GetDynstrData(); - if (symtab_data && strtab_data) { - auto [num_symbols_parsed, address_class_map] = - ParseSymbols(&lldb_symtab, symbol_id, section_list, num_symbols, - symtab_data.value(), strtab_data.value()); - symbol_id += num_symbols_parsed; - m_address_class_map.merge(address_class_map); - } - } // DT_JMPREL // If present, this entry's d_ptr member holds the address of @@ -3844,33 +3828,6 @@ ObjectFileELF::MapFileDataWritable(const FileSpec &file, uint64_t Size, Offset); } -std::optional -ObjectFileELF::ReadDataFromDynamic(const ELFDynamic *dyn, uint64_t length, - uint64_t offset) { - // ELFDynamic values contain a "d_ptr" member that will be a load address if - // we have an ELF file read from memory, or it will be a file address if it - // was read from a ELF file. This function will correctly fetch data pointed - // to by the ELFDynamic::d_ptr, or return std::nullopt if the data isn't - // available. 
- const lldb::addr_t d_ptr_addr = dyn->d_ptr + offset; - if (ProcessSP process_sp = m_process_wp.lock()) { - if (DataBufferSP data_sp = ReadMemory(process_sp, d_ptr_addr, length)) - return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); - } else { - // We have an ELF file with no section headers or we didn't find the - // .dynamic section. Try and find the .dynstr section. - Address addr; - if (!addr.ResolveAddressUsingFileSections(d_ptr_addr, GetSectionList())) - return std::nullopt; - DataExtractor data; - addr.GetSection()->GetSectionData(data); - return DataExtractor(data, - d_ptr_addr - addr.GetSection()->GetFileAddress(), - length); - } - return std::nullopt; -} - std::optional ObjectFileELF::GetDynstrData() { if (SectionList *section_list = GetSectionList()) { // Find the SHT_DYNAMIC section. @@ -3898,15 +3855,31 @@ std::optional ObjectFileELF::GetDynstrData() { // and represent the dynamic symbol tables's string table. These are needed // by the dynamic loader and we can read them from a process' address space. // - // When loading and ELF file from memory, only the program headers are - // guaranteed end up being mapped into memory, and we can find these values in - // the PT_DYNAMIC segment. + // When loading and ELF file from memory, only the program headers end up + // being mapped into memory, and we can find these values in the PT_DYNAMIC + // segment. const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB); const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ); if (strtab == nullptr || strsz == nullptr) return std::nullopt; - return ReadDataFromDynamic(strtab, strsz->d_val, /*offset=*/0); + if (ProcessSP process_sp = m_process_wp.lock()) { + if (DataBufferSP data_sp = + ReadMemory(process_sp, strtab->d_ptr, strsz->d_val)) + return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); + } else { + // We have an ELF file with no section headers or we didn't find the + // .dynamic section. Try and find the .dynstr section. + Address addr; + if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) { + DataExtractor data; + addr.GetSection()->GetSectionData(data); + return DataExtractor(data, + strtab->d_ptr - addr.GetSection()->GetFileAddress(), + strsz->d_val); + } + } + return std::nullopt; } std::optional ObjectFileELF::GetDynamicData() { @@ -3939,116 +3912,3 @@ std::optional ObjectFileELF::GetDynamicData() { } return std::nullopt; } - -std::optional ObjectFileELF::GetNumSymbolsFromDynamicHash() { - const ELFDynamic *hash = FindDynamicSymbol(DT_HASH); - if (hash == nullptr) - return std::nullopt; - - // The DT_HASH header looks like this: - struct DtHashHeader { - uint32_t nbucket; - uint32_t nchain; - }; - if (auto data = ReadDataFromDynamic(hash, 8)) { - // We don't need the number of buckets value "nbucket", we just need the - // "nchain" value which contains the number of symbols. - offset_t offset = offsetof(DtHashHeader, nchain); - return data->GetU32(&offset); - } - - return std::nullopt; -} - -std::optional ObjectFileELF::GetNumSymbolsFromDynamicGnuHash() { - const ELFDynamic *gnu_hash = FindDynamicSymbol(DT_GNU_HASH); - if (gnu_hash == nullptr) - return std::nullopt; - - // Create a DT_GNU_HASH header - // https://flapenguin.me/elf-dt-gnu-hash - struct DtGnuHashHeader { - uint32_t nbuckets = 0; - uint32_t symoffset = 0; - uint32_t bloom_size = 0; - uint32_t bloom_shift = 0; - }; - uint32_t num_symbols = 0; - // Read enogh data for the DT_GNU_HASH header so we can extract the values. 
- if (auto data = ReadDataFromDynamic(gnu_hash, sizeof(DtGnuHashHeader))) { - offset_t offset = 0; - DtGnuHashHeader header; - header.nbuckets = data->GetU32(&offset); - header.symoffset = data->GetU32(&offset); - header.bloom_size = data->GetU32(&offset); - header.bloom_shift = data->GetU32(&offset); - const size_t addr_size = GetAddressByteSize(); - const addr_t buckets_offset = - sizeof(DtGnuHashHeader) + addr_size * header.bloom_size; - std::vector buckets; - if (auto bucket_data = ReadDataFromDynamic(gnu_hash, header.nbuckets * 4, buckets_offset)) { - offset = 0; - for (uint32_t i = 0; i < header.nbuckets; ++i) - buckets.push_back(bucket_data->GetU32(&offset)); - // Locate the chain that handles the largest index bucket. - uint32_t last_symbol = 0; - for (uint32_t bucket_value : buckets) - last_symbol = std::max(bucket_value, last_symbol); - if (last_symbol < header.symoffset) { - num_symbols = header.symoffset; - } else { - // Walk the bucket's chain to add the chain length to the total. - const addr_t chains_base_offset = buckets_offset + header.nbuckets * 4; - for (;;) { - if (auto chain_entry_data = ReadDataFromDynamic(gnu_hash, 4, chains_base_offset + (last_symbol - header.symoffset) * 4)) { - offset = 0; - uint32_t chain_entry = chain_entry_data->GetU32(&offset); - ++last_symbol; - // If the low bit is set, this entry is the end of the chain. - if (chain_entry & 1) - break; - } else { - break; - } - } - num_symbols = last_symbol; - } - } - } - if (num_symbols > 0) - return num_symbols; - - return std::nullopt; -} - -std::optional -ObjectFileELF::GetDynsymDataFromDynamic(uint32_t &num_symbols) { - // Every ELF file which represents an executable or shared library has - // mandatory .dynamic entries. The DT_SYMTAB value contains a pointer to the - // symbol table, and DT_SYMENT contains the size of a symbol table entry. - // We then can use either the DT_HASH or DT_GNU_HASH to find the number of - // symbols in the symbol table as the symbol count is not stored in the - // .dynamic section as a key/value pair. - // - // When loading and ELF file from memory, only the program headers end up - // being mapped into memory, and we can find these values in the PT_DYNAMIC - // segment. - num_symbols = 0; - // Get the process in case this is an in memory ELF file. - ProcessSP process_sp(m_process_wp.lock()); - const ELFDynamic *symtab = FindDynamicSymbol(DT_SYMTAB); - const ELFDynamic *syment = FindDynamicSymbol(DT_SYMENT); - // DT_SYMTAB and DT_SYMENT are mandatory. - if (symtab == nullptr || syment == nullptr) - return std::nullopt; - - if (std::optional syms = GetNumSymbolsFromDynamicHash()) - num_symbols = *syms; - else if (std::optional syms = GetNumSymbolsFromDynamicGnuHash()) - num_symbols = *syms; - else - return std::nullopt; - if (num_symbols == 0) - return std::nullopt; - return ReadDataFromDynamic(symtab, syment->d_val * num_symbols); -} diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h index 16c216eb81e72..aba3a5bfcbf5b 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h @@ -435,47 +435,6 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// \return The bytes that represent the string table data or \c std::nullopt /// if an error occured. std::optional GetDynstrData(); - - /// Read the bytes pointed to by the \a dyn dynamic entry. 
- /// - /// ELFDynamic::d_ptr values contain file addresses if we load the ELF file - /// form a file on disk, or they contain load addresses if they were read - /// from memory. This function will correctly extract the data in both cases - /// if it is available. - /// - /// \param[in] dyn The dynamic entry to use to fetch the data from. - /// - /// \param[in] length The number of bytes to read. - /// - /// \param[in] offset The number of bytes to skip after the d_ptr value - /// before reading data. - /// - /// \return The bytes that represent the dynanic entries data or - /// \c std::nullopt if an error occured or the data is not available. - std::optional - ReadDataFromDynamic(const elf::ELFDynamic *dyn, uint64_t length, - uint64_t offset = 0); - - /// Get the bytes that represent the dynamic symbol table from the .dynamic - /// section from process memory. - /// - /// This functon uses the DT_SYMTAB value from the .dynamic section to read - /// the symbols table data from process memory. The number of symbols in the - /// symbol table is calculated by looking at the DT_HASH or DT_GNU_HASH - /// values as the symbol count isn't stored in the .dynamic section. - /// - /// \return The bytes that represent the symbol table data from the .dynamic - /// section or section headers or \c std::nullopt if an error - /// occured or if there is no dynamic symbol data available. - std::optional - GetDynsymDataFromDynamic(uint32_t &num_symbols); - - /// Get the number of symbols from the DT_HASH dynamic entry. - std::optional GetNumSymbolsFromDynamicHash(); - - /// Get the number of symbols from the DT_GNU_HASH dynamic entry. - std::optional GetNumSymbolsFromDynamicGnuHash(); - }; #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test b/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test deleted file mode 100644 index 7d948e2cd225c..0000000000000 --- a/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test +++ /dev/null @@ -1,42 +0,0 @@ -// This test verifies that loading an ELF file that has no section headers can -// load the dynamic symbol table using the DT_SYMTAB, DT_SYMENT, DT_HASH or -// the DT_GNU_HASH .dynamic key/value pairs that are loaded via the PT_DYNAMIC -// segment. - -// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \ -// RUN: -o - - <<<".globl defined, undefined; defined:" | \ -// RUN: ld.lld /dev/stdin -o - --hash-style=gnu -export-dynamic -shared \ -// RUN: -z nosectionheader -o %t.gnu -// RUN: %lldb %t.gnu -b \ -// RUN: -o "image dump objfile" \ -// RUN: | FileCheck %s --dump-input=always --check-prefix=GNU -// GNU: (lldb) image dump objfile -// GNU: Dumping headers for 1 module(s). -// GNU: ObjectFileELF, file = -// GNU: ELF Header -// GNU: e_type = 0x0003 ET_DYN -// Make sure there are no section headers -// GNU: e_shnum = 0x00000000 -// Make sure we were able to load the symbols -// GNU: Symtab, file = {{.*}}elf-dynsym.test.tmp.gnu, num_symbols = 2: -// GNU-DAG: undefined -// GNU-DAG: defined - -// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \ -// RUN: -o - - <<<".globl defined, undefined; defined:" | \ -// RUN: ld.lld /dev/stdin -o - --hash-style=sysv -export-dynamic -shared \ -// RUN: -z nosectionheader -o %t.sysv -// RUN: %lldb %t.sysv -b \ -// RUN: -o "image dump objfile" \ -// RUN: | FileCheck %s --dump-input=always --check-prefix=HASH -// HASH: (lldb) image dump objfile -// HASH: Dumping headers for 1 module(s). 
-// HASH: ObjectFileELF, file = -// HASH: ELF Header -// HASH: e_type = 0x0003 ET_DYN -// Make sure there are no section headers -// HASH: e_shnum = 0x00000000 -// Make sure we were able to load the symbols -// HASH: Symtab, file = {{.*}}elf-dynsym.test.tmp.sysv, num_symbols = 2: -// HASH-DAG: undefined -// HASH-DAG: defined From b769e3544a763a90abefd0dbe9254d83c765e1dc Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Mon, 18 Nov 2024 11:45:41 -0800 Subject: [PATCH 026/366] [clang][serialization] Blobify IMPORTS strings and signatures (#116095) This PR changes a part of the PCM format to store string-like things in the blob attached to a record instead of VBR6-encoding them into the record itself. Applied to the `IMPORTS` section (which is very hot), this speeds up dependency scanning by 2.8%. --- .../include/clang/Serialization/ASTBitCodes.h | 7 +- clang/include/clang/Serialization/ASTReader.h | 14 +- clang/include/clang/Serialization/ASTWriter.h | 4 + clang/lib/Serialization/ASTReader.cpp | 224 ++++++++++-------- clang/lib/Serialization/ASTWriter.cpp | 56 ++++- clang/lib/Serialization/GlobalModuleIndex.cpp | 108 ++++----- 6 files changed, 227 insertions(+), 186 deletions(-) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 8725d5455ec73..fd834c14ce790 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -44,7 +44,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. -const unsigned VERSION_MAJOR = 33; +const unsigned VERSION_MAJOR = 34; /// AST file minor version number supported by this version of /// Clang. @@ -350,9 +350,8 @@ enum ControlRecordTypes { /// and information about the compiler used to build this AST file. METADATA = 1, - /// Record code for the list of other AST files imported by - /// this AST file. - IMPORTS, + /// Record code for another AST file imported by this AST file. + IMPORT, /// Record code for the original file that was used to /// generate the AST file, including both its file ID and its diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 9c274adc59a20..f739fe688c110 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2389,11 +2389,8 @@ class ASTReader // Read a string static std::string ReadString(const RecordDataImpl &Record, unsigned &Idx); - - // Skip a string - static void SkipString(const RecordData &Record, unsigned &Idx) { - Idx += Record[Idx] + 1; - } + static StringRef ReadStringBlob(const RecordDataImpl &Record, unsigned &Idx, + StringRef &Blob); // Read a path std::string ReadPath(ModuleFile &F, const RecordData &Record, unsigned &Idx); @@ -2401,11 +2398,8 @@ class ASTReader // Read a path std::string ReadPath(StringRef BaseDirectory, const RecordData &Record, unsigned &Idx); - - // Skip a path - static void SkipPath(const RecordData &Record, unsigned &Idx) { - SkipString(Record, Idx); - } + std::string ReadPathBlob(StringRef BaseDirectory, const RecordData &Record, + unsigned &Idx, StringRef &Blob); /// Read a version tuple. 
static VersionTuple ReadVersionTuple(const RecordData &Record, unsigned &Idx); diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 161b2ef7c86a4..e418fdea44a0a 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -769,6 +769,8 @@ class ASTWriter : public ASTDeserializationListener, /// Add a string to the given record. void AddString(StringRef Str, RecordDataImpl &Record); + void AddStringBlob(StringRef Str, RecordDataImpl &Record, + SmallVectorImpl &Blob); /// Convert a path from this build process into one that is appropriate /// for emission in the module file. @@ -776,6 +778,8 @@ class ASTWriter : public ASTDeserializationListener, /// Add a path to the given record. void AddPath(StringRef Path, RecordDataImpl &Record); + void AddPathBlob(StringRef Str, RecordDataImpl &Record, + SmallVectorImpl &Blob); /// Emit the current record with the given path as a blob. void EmitRecordWithPath(unsigned Abbrev, RecordDataRef Record, diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8b928ede395ae..ec85fad3389a1 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -3092,98 +3092,97 @@ ASTReader::ReadControlBlock(ModuleFile &F, break; } - case IMPORTS: { + case IMPORT: { // Validate the AST before processing any imports (otherwise, untangling // them can be error-prone and expensive). A module will have a name and // will already have been validated, but this catches the PCH case. if (ASTReadResult Result = readUnhashedControlBlockOnce()) return Result; - // Load each of the imported PCH files. - unsigned Idx = 0, N = Record.size(); - while (Idx < N) { - // Read information about the AST file. - ModuleKind ImportedKind = (ModuleKind)Record[Idx++]; - // Whether we're importing a standard c++ module. - bool IsImportingStdCXXModule = Record[Idx++]; - // The import location will be the local one for now; we will adjust - // all import locations of module imports after the global source - // location info are setup, in ReadAST. - auto [ImportLoc, ImportModuleFileIndex] = - ReadUntranslatedSourceLocation(Record[Idx++]); - // The import location must belong to the current module file itself. - assert(ImportModuleFileIndex == 0); - off_t StoredSize = !IsImportingStdCXXModule ? (off_t)Record[Idx++] : 0; - time_t StoredModTime = - !IsImportingStdCXXModule ? (time_t)Record[Idx++] : 0; - - ASTFileSignature StoredSignature; - if (!IsImportingStdCXXModule) { - auto FirstSignatureByte = Record.begin() + Idx; - StoredSignature = ASTFileSignature::create( - FirstSignatureByte, FirstSignatureByte + ASTFileSignature::size); - Idx += ASTFileSignature::size; - } + unsigned Idx = 0; + // Read information about the AST file. + ModuleKind ImportedKind = (ModuleKind)Record[Idx++]; + + // The import location will be the local one for now; we will adjust + // all import locations of module imports after the global source + // location info are setup, in ReadAST. + auto [ImportLoc, ImportModuleFileIndex] = + ReadUntranslatedSourceLocation(Record[Idx++]); + // The import location must belong to the current module file itself. 
+ assert(ImportModuleFileIndex == 0); + + StringRef ImportedName = ReadStringBlob(Record, Idx, Blob); + + bool IsImportingStdCXXModule = Record[Idx++]; + + off_t StoredSize = 0; + time_t StoredModTime = 0; + ASTFileSignature StoredSignature; + std::string ImportedFile; + + // For prebuilt and explicit modules first consult the file map for + // an override. Note that here we don't search prebuilt module + // directories if we're not importing standard c++ module, only the + // explicit name to file mappings. Also, we will still verify the + // size/signature making sure it is essentially the same file but + // perhaps in a different location. + if (ImportedKind == MK_PrebuiltModule || ImportedKind == MK_ExplicitModule) + ImportedFile = PP.getHeaderSearchInfo().getPrebuiltModuleFileName( + ImportedName, /*FileMapOnly*/ !IsImportingStdCXXModule); + + if (IsImportingStdCXXModule && ImportedFile.empty()) { + Diag(diag::err_failed_to_find_module_file) << ImportedName; + return Missing; + } - std::string ImportedName = ReadString(Record, Idx); - std::string ImportedFile; - - // For prebuilt and explicit modules first consult the file map for - // an override. Note that here we don't search prebuilt module - // directories if we're not importing standard c++ module, only the - // explicit name to file mappings. Also, we will still verify the - // size/signature making sure it is essentially the same file but - // perhaps in a different location. - if (ImportedKind == MK_PrebuiltModule || ImportedKind == MK_ExplicitModule) - ImportedFile = PP.getHeaderSearchInfo().getPrebuiltModuleFileName( - ImportedName, /*FileMapOnly*/ !IsImportingStdCXXModule); - - // For C++20 Modules, we won't record the path to the imported modules - // in the BMI - if (!IsImportingStdCXXModule) { - if (ImportedFile.empty()) { - // Use BaseDirectoryAsWritten to ensure we use the same path in the - // ModuleCache as when writing. - ImportedFile = ReadPath(BaseDirectoryAsWritten, Record, Idx); - } else - SkipPath(Record, Idx); - } else if (ImportedFile.empty()) { - Diag(clang::diag::err_failed_to_find_module_file) << ImportedName; - return Missing; - } + if (!IsImportingStdCXXModule) { + StoredSize = (off_t)Record[Idx++]; + StoredModTime = (time_t)Record[Idx++]; - // If our client can't cope with us being out of date, we can't cope with - // our dependency being missing. - unsigned Capabilities = ClientLoadCapabilities; - if ((ClientLoadCapabilities & ARR_OutOfDate) == 0) - Capabilities &= ~ARR_Missing; - - // Load the AST file. - auto Result = ReadASTCore(ImportedFile, ImportedKind, ImportLoc, &F, - Loaded, StoredSize, StoredModTime, - StoredSignature, Capabilities); - - // If we diagnosed a problem, produce a backtrace. - bool recompilingFinalized = - Result == OutOfDate && (Capabilities & ARR_OutOfDate) && - getModuleManager().getModuleCache().isPCMFinal(F.FileName); - if (isDiagnosedResult(Result, Capabilities) || recompilingFinalized) - Diag(diag::note_module_file_imported_by) - << F.FileName << !F.ModuleName.empty() << F.ModuleName; - if (recompilingFinalized) - Diag(diag::note_module_file_conflict); - - switch (Result) { - case Failure: return Failure; - // If we have to ignore the dependency, we'll have to ignore this too. 
- case Missing: - case OutOfDate: return OutOfDate; - case VersionMismatch: return VersionMismatch; - case ConfigurationMismatch: return ConfigurationMismatch; - case HadErrors: return HadErrors; - case Success: break; + StringRef SignatureBytes = Blob.substr(0, ASTFileSignature::size); + StoredSignature = ASTFileSignature::create(SignatureBytes.begin(), + SignatureBytes.end()); + Blob = Blob.substr(ASTFileSignature::size); + + if (ImportedFile.empty()) { + // Use BaseDirectoryAsWritten to ensure we use the same path in the + // ModuleCache as when writing. + ImportedFile = + ReadPathBlob(BaseDirectoryAsWritten, Record, Idx, Blob); } } + + // If our client can't cope with us being out of date, we can't cope with + // our dependency being missing. + unsigned Capabilities = ClientLoadCapabilities; + if ((ClientLoadCapabilities & ARR_OutOfDate) == 0) + Capabilities &= ~ARR_Missing; + + // Load the AST file. + auto Result = ReadASTCore(ImportedFile, ImportedKind, ImportLoc, &F, + Loaded, StoredSize, StoredModTime, + StoredSignature, Capabilities); + + // If we diagnosed a problem, produce a backtrace. + bool recompilingFinalized = + Result == OutOfDate && (Capabilities & ARR_OutOfDate) && + getModuleManager().getModuleCache().isPCMFinal(F.FileName); + if (isDiagnosedResult(Result, Capabilities) || recompilingFinalized) + Diag(diag::note_module_file_imported_by) + << F.FileName << !F.ModuleName.empty() << F.ModuleName; + if (recompilingFinalized) + Diag(diag::note_module_file_conflict); + + switch (Result) { + case Failure: return Failure; + // If we have to ignore the dependency, we'll have to ignore this too. + case Missing: + case OutOfDate: return OutOfDate; + case VersionMismatch: return VersionMismatch; + case ConfigurationMismatch: return ConfigurationMismatch; + case HadErrors: return HadErrors; + case Success: break; + } break; } @@ -5624,36 +5623,38 @@ bool ASTReader::readASTFileControlBlock( break; } - case IMPORTS: { + case IMPORT: { if (!NeedsImports) break; - unsigned Idx = 0, N = Record.size(); - while (Idx < N) { - // Read information about the AST file. + unsigned Idx = 0; + // Read information about the AST file. + + // Skip Kind + Idx++; - // Skip Kind - Idx++; - bool IsStandardCXXModule = Record[Idx++]; + // Skip ImportLoc + Idx++; - // Skip ImportLoc - Idx++; + StringRef ModuleName = ReadStringBlob(Record, Idx, Blob); - // In C++20 Modules, we don't record the path to imported - // modules in the BMI files. - if (IsStandardCXXModule) { - std::string ModuleName = ReadString(Record, Idx); - Listener.visitImport(ModuleName, /*Filename=*/""); - continue; - } + bool IsStandardCXXModule = Record[Idx++]; - // Skip Size, ModTime and Signature - Idx += 1 + 1 + ASTFileSignature::size; - std::string ModuleName = ReadString(Record, Idx); - std::string FilenameStr = ReadString(Record, Idx); - auto Filename = ResolveImportedPath(PathBuf, FilenameStr, ModuleDir); - Listener.visitImport(ModuleName, *Filename); + // In C++20 Modules, we don't record the path to imported + // modules in the BMI files. + if (IsStandardCXXModule) { + Listener.visitImport(ModuleName, /*Filename=*/""); + continue; } + + // Skip Size and ModTime. + Idx += 1 + 1; + // Skip signature. 
+ Blob = Blob.substr(ASTFileSignature::size); + + StringRef FilenameStr = ReadStringBlob(Record, Idx, Blob); + auto Filename = ResolveImportedPath(PathBuf, FilenameStr, ModuleDir); + Listener.visitImport(ModuleName, *Filename); break; } @@ -9602,6 +9603,14 @@ std::string ASTReader::ReadString(const RecordDataImpl &Record, unsigned &Idx) { return Result; } +StringRef ASTReader::ReadStringBlob(const RecordDataImpl &Record, unsigned &Idx, + StringRef &Blob) { + unsigned Len = Record[Idx++]; + StringRef Result = Blob.substr(0, Len); + Blob = Blob.substr(Len); + return Result; +} + std::string ASTReader::ReadPath(ModuleFile &F, const RecordData &Record, unsigned &Idx) { return ReadPath(F.BaseDirectory, Record, Idx); @@ -9613,6 +9622,13 @@ std::string ASTReader::ReadPath(StringRef BaseDirectory, return ResolveImportedPathAndAllocate(PathBuf, Filename, BaseDirectory); } +std::string ASTReader::ReadPathBlob(StringRef BaseDirectory, + const RecordData &Record, unsigned &Idx, + StringRef &Blob) { + StringRef Filename = ReadStringBlob(Record, Idx, Blob); + return ResolveImportedPathAndAllocate(PathBuf, Filename, BaseDirectory); +} + VersionTuple ASTReader::ReadVersionTuple(const RecordData &Record, unsigned &Idx) { unsigned Major = Record[Idx++]; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 88b3e649a5d46..a52d59c61c4ce 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -878,7 +878,7 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(MODULE_NAME); RECORD(MODULE_DIRECTORY); RECORD(MODULE_MAP_FILE); - RECORD(IMPORTS); + RECORD(IMPORT); RECORD(ORIGINAL_FILE); RECORD(ORIGINAL_FILE_ID); RECORD(INPUT_FILE_OFFSETS); @@ -1536,34 +1536,53 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, StringRef isysroot) { // Imports if (Chain) { - serialization::ModuleManager &Mgr = Chain->getModuleManager(); - Record.clear(); + auto Abbrev = std::make_shared(); + Abbrev->Add(BitCodeAbbrevOp(IMPORT)); + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Kind + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ImportLoc + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Module name len + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Standard C++ mod + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File size + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File timestamp + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File name len + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Strings + unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev)); - for (ModuleFile &M : Mgr) { + SmallString<128> Blob; + + for (ModuleFile &M : Chain->getModuleManager()) { // Skip modules that weren't directly imported. if (!M.isDirectlyImported()) continue; + Record.clear(); + Blob.clear(); + + Record.push_back(IMPORT); Record.push_back((unsigned)M.Kind); // FIXME: Stable encoding - Record.push_back(M.StandardCXXModule); AddSourceLocation(M.ImportLoc, Record); + AddStringBlob(M.ModuleName, Record, Blob); + Record.push_back(M.StandardCXXModule); // We don't want to hard code the information about imported modules // in the C++20 named modules. - if (!M.StandardCXXModule) { + if (M.StandardCXXModule) { + Record.push_back(0); + Record.push_back(0); + Record.push_back(0); + } else { // If we have calculated signature, there is no need to store // the size or timestamp. Record.push_back(M.Signature ? 0 : M.File.getSize()); Record.push_back(M.Signature ? 
0 : getTimestampForOutput(M.File)); - llvm::append_range(Record, M.Signature); - } - AddString(M.ModuleName, Record); + llvm::append_range(Blob, M.Signature); - if (!M.StandardCXXModule) - AddPath(M.FileName, Record); + AddPathBlob(M.FileName, Record, Blob); + } + + Stream.EmitRecordWithBlob(AbbrevCode, Record, Blob); } - Stream.EmitRecord(IMPORTS, Record); } // Write the options block. @@ -4777,6 +4796,12 @@ void ASTWriter::AddString(StringRef Str, RecordDataImpl &Record) { Record.insert(Record.end(), Str.begin(), Str.end()); } +void ASTWriter::AddStringBlob(StringRef Str, RecordDataImpl &Record, + SmallVectorImpl &Blob) { + Record.push_back(Str.size()); + Blob.insert(Blob.end(), Str.begin(), Str.end()); +} + bool ASTWriter::PreparePathForOutput(SmallVectorImpl &Path) { assert(WritingAST && "can't prepare path for output when not writing AST"); @@ -4805,6 +4830,13 @@ void ASTWriter::AddPath(StringRef Path, RecordDataImpl &Record) { AddString(FilePath, Record); } +void ASTWriter::AddPathBlob(StringRef Path, RecordDataImpl &Record, + SmallVectorImpl &Blob) { + SmallString<128> FilePath(Path); + PreparePathForOutput(FilePath); + AddStringBlob(FilePath, Record, Blob); +} + void ASTWriter::EmitRecordWithPath(unsigned Abbrev, RecordDataRef Record, StringRef Path) { SmallString<128> FilePath(Path); diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp index 9c48712a0b3fb..4b920fccecac3 100644 --- a/clang/lib/Serialization/GlobalModuleIndex.cpp +++ b/clang/lib/Serialization/GlobalModuleIndex.cpp @@ -614,62 +614,58 @@ llvm::Error GlobalModuleIndexBuilder::loadModuleFile(FileEntryRef File) { unsigned Code = MaybeCode.get(); // Handle module dependencies. - if (State == ControlBlock && Code == IMPORTS) { - // Load each of the imported PCH files. - unsigned Idx = 0, N = Record.size(); - while (Idx < N) { - // Read information about the AST file. - - // Skip the imported kind - ++Idx; - - // Skip if it is standard C++ module - ++Idx; - - // Skip the import location - ++Idx; - - // Load stored size/modification time. - off_t StoredSize = (off_t)Record[Idx++]; - time_t StoredModTime = (time_t)Record[Idx++]; - - // Skip the stored signature. - // FIXME: we could read the signature out of the import and validate it. - auto FirstSignatureByte = Record.begin() + Idx; - ASTFileSignature StoredSignature = ASTFileSignature::create( - FirstSignatureByte, FirstSignatureByte + ASTFileSignature::size); - Idx += ASTFileSignature::size; - - // Skip the module name (currently this is only used for prebuilt - // modules while here we are only dealing with cached). - Idx += Record[Idx] + 1; - - // Retrieve the imported file name. - unsigned Length = Record[Idx++]; - SmallString<128> ImportedFile(Record.begin() + Idx, - Record.begin() + Idx + Length); - Idx += Length; - - // Find the imported module file. - auto DependsOnFile = - FileMgr.getOptionalFileRef(ImportedFile, /*OpenFile=*/false, - /*CacheFailure=*/false); - - if (!DependsOnFile) - return llvm::createStringError(std::errc::bad_file_descriptor, - "imported file \"%s\" not found", - ImportedFile.c_str()); - - // Save the information in ImportedModuleFileInfo so we can verify after - // loading all pcms. - ImportedModuleFiles.insert(std::make_pair( - *DependsOnFile, ImportedModuleFileInfo(StoredSize, StoredModTime, - StoredSignature))); - - // Record the dependency. 
- unsigned DependsOnID = getModuleFileInfo(*DependsOnFile).ID; - getModuleFileInfo(File).Dependencies.push_back(DependsOnID); - } + if (State == ControlBlock && Code == IMPORT) { + unsigned Idx = 0; + // Read information about the AST file. + + // Skip the imported kind + ++Idx; + + // Skip the import location + ++Idx; + + // Skip the module name (currently this is only used for prebuilt + // modules while here we are only dealing with cached). + Blob = Blob.substr(Record[Idx++]); + + // Skip if it is standard C++ module + ++Idx; + + // Load stored size/modification time. + off_t StoredSize = (off_t)Record[Idx++]; + time_t StoredModTime = (time_t)Record[Idx++]; + + // Skip the stored signature. + // FIXME: we could read the signature out of the import and validate it. + StringRef SignatureBytes = Blob.substr(0, ASTFileSignature::size); + auto StoredSignature = ASTFileSignature::create(SignatureBytes.begin(), + SignatureBytes.end()); + Blob = Blob.substr(ASTFileSignature::size); + + // Retrieve the imported file name. + unsigned Length = Record[Idx++]; + StringRef ImportedFile = Blob.substr(0, Length); + Blob = Blob.substr(Length); + + // Find the imported module file. + auto DependsOnFile = + FileMgr.getOptionalFileRef(ImportedFile, /*OpenFile=*/false, + /*CacheFailure=*/false); + + if (!DependsOnFile) + return llvm::createStringError(std::errc::bad_file_descriptor, + "imported file \"%s\" not found", + std::string(ImportedFile).c_str()); + + // Save the information in ImportedModuleFileInfo so we can verify after + // loading all pcms. + ImportedModuleFiles.insert(std::make_pair( + *DependsOnFile, ImportedModuleFileInfo(StoredSize, StoredModTime, + StoredSignature))); + + // Record the dependency. + unsigned DependsOnID = getModuleFileInfo(*DependsOnFile).ID; + getModuleFileInfo(File).Dependencies.push_back(DependsOnID); continue; } From 7b525495e8574285c19188be11e7ef8a51382ff3 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Mon, 18 Nov 2024 20:03:35 +0000 Subject: [PATCH 027/366] [test][PGO] Use -fprofile-update=atomic instead of mllvm option in ContinuousSyncMode/online-merging.c --- .../test/profile/ContinuousSyncMode/online-merging.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c index 35b0cd0b05d1f..54346487a5c79 100644 --- a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c +++ b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c @@ -8,9 +8,9 @@ // Create two DSOs and a driver program that uses them. // RUN: echo "void dso1(void) {}" > dso1.c // RUN: echo "void dso2(void) {}" > dso2.c -// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso1.dylib dso1.c -mllvm -instrprof-atomic-counter-update-all=1 -// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso2.dylib dso2.c -mllvm -instrprof-atomic-counter-update-all=1 -// RUN: %clang_pgogen_cont -o main.exe %s %t.dir/dso1.dylib %t.dir/dso2.dylib -mllvm -instrprof-atomic-counter-update-all=1 +// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso1.dylib dso1.c -fprofile-update=atomic +// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso2.dylib dso2.c -fprofile-update=atomic +// RUN: %clang_pgogen_cont -o main.exe %s %t.dir/dso1.dylib %t.dir/dso2.dylib -fprofile-update=atomic // // === Round 1 === // Test merging+continuous mode without any file contention. 
From 842fd1537521d38913aec5c9a081afedf97d88fe Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 18 Nov 2024 12:06:52 -0800 Subject: [PATCH 028/366] [llvm-exegesis] Add explicit support for setting DF in X86 (#115644) While llvm-exegesis has explicit support for setting EFLAGS which contains DF, it can be nice sometimes to explicitly set DF, especially given that it is modeled as a separate register within LLVM. This patch adds the ability to do that by lowering setting the value to 0 or 1 to cld and std respectively. --- llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 13 +++++++++++++ .../tools/llvm-exegesis/X86/TargetTest.cpp | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 0a70321fab781..3c3bff76fb681 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -537,6 +537,8 @@ struct ConstantInliner { std::vector loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value); + std::vector loadDirectionFlagAndFinalize(); + private: ConstantInliner &add(const MCInst &Inst) { Instructions.push_back(Inst); @@ -612,6 +614,15 @@ ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) { return std::move(Instructions); } +std::vector ConstantInliner::loadDirectionFlagAndFinalize() { + if (Constant_.isZero()) + add(MCInstBuilder(X86::CLD)); + else if (Constant_.isOne()) + add(MCInstBuilder(X86::STD)); + + return std::move(Instructions); +} + void ConstantInliner::initStack(unsigned Bytes) { assert(Constant_.getBitWidth() <= Bytes * 8 && "Value does not have the correct size"); @@ -1089,6 +1100,8 @@ std::vector ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, 0x1f80); if (Reg == X86::FPCW) return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f); + if (Reg == X86::DF) + return CI.loadDirectionFlagAndFinalize(); return {}; // Not yet implemented. } diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp index 921d7d7975f6a..3dff50c44798d 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp @@ -585,6 +585,14 @@ TEST_F(X86Core2TargetTest, SetRegToFP1_4Bits) { OpcodeIs(X86::LD_Fp80m), IsStackDeallocate(10))); } +TEST_F(X86Core2TargetTest, SetRegToDf1) { + EXPECT_THAT(setRegTo(X86::DF, APInt(1, 1)), ElementsAre(OpcodeIs(X86::STD))); +} + +TEST_F(X86Core2TargetTest, SetRegToDf0) { + EXPECT_THAT(setRegTo(X86::DF, APInt(1, 0)), ElementsAre(OpcodeIs(X86::CLD))); +} + TEST_F(X86Core2Avx512TargetTest, FillMemoryOperands_ADD64rm) { const Instruction &I = getInstr(X86::ADD64rm); InstructionTemplate IT(&I); From 3d172f3dff25ce70f7158330ac4068e48e2b364d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 18 Nov 2024 20:10:01 +0000 Subject: [PATCH 029/366] [Linker] Remove dead code handling recursive types. NFC. (#116652) --- llvm/lib/Linker/IRMover.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 0d54c534590ca..c653900c632cc 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -292,17 +292,9 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) { AnyChange |= ElementTypes[I] != Ty->getContainedType(I); } - // If we found our type while recursively processing stuff, just use it. + // Refresh Entry after recursively processing stuff. 
Entry = &MappedTypes[Ty]; - if (*Entry) { - if (auto *DTy = dyn_cast(*Entry)) { - if (DTy->isOpaque()) { - auto *STy = cast(Ty); - finishType(DTy, STy, ElementTypes); - } - } - return *Entry; - } + assert(!*Entry && "Recursive type!"); // If all of the element types mapped directly over and the type is not // a named struct, then the type is usable as-is. From eac02611048a81bd78e461b651158c3c6557cb74 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 18 Nov 2024 20:11:27 +0000 Subject: [PATCH 030/366] [Linker] Remove a use of StructType::setBody. NFC. (#116653) This falls out naturally after inlining finishType into its only remaining use. --- llvm/lib/Linker/IRMover.cpp | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index c653900c632cc..4bb0ddf891744 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -82,8 +82,6 @@ class TypeMapTy : public ValueMapTypeRemapper { Type *get(Type *SrcTy); Type *get(Type *SrcTy, SmallPtrSet &Visited); - void finishType(StructType *DTy, StructType *STy, ArrayRef ETypes); - FunctionType *get(FunctionType *T) { return cast(get((Type *)T)); } @@ -233,20 +231,6 @@ Error TypeMapTy::linkDefinedTypeBodies() { return Error::success(); } -void TypeMapTy::finishType(StructType *DTy, StructType *STy, - ArrayRef ETypes) { - DTy->setBody(ETypes, STy->isPacked()); - - // Steal STy's name. - if (STy->hasName()) { - SmallString<16> TmpName = STy->getName(); - STy->setName(""); - DTy->setName(TmpName); - } - - DstStructTypesSet.addNonOpaque(DTy); -} - Type *TypeMapTy::get(Type *Ty) { SmallPtrSet Visited; return get(Ty, Visited); @@ -342,8 +326,17 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet &Visited) { return *Entry = Ty; } - StructType *DTy = StructType::create(Ty->getContext()); - finishType(DTy, STy, ElementTypes); + StructType *DTy = + StructType::create(Ty->getContext(), ElementTypes, "", STy->isPacked()); + + // Steal STy's name. + if (STy->hasName()) { + SmallString<16> TmpName = STy->getName(); + STy->setName(""); + DTy->setName(TmpName); + } + + DstStructTypesSet.addNonOpaque(DTy); return *Entry = DTy; } } From 1d0b2851224b1ef97c49faac2c666535f1997363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Mon, 18 Nov 2024 21:14:55 +0100 Subject: [PATCH 031/366] [lldb] Relax check for breakpoint site in Unwind/windows-unaligned-x86_64.test (#115318) This test checks the thread backtrace for entries of intermediate frames that aren't aligned to 16 bytes. In order to do that, it sets a single breakpoint and makes sure we stop there. It seems sufficient, however, to check that we hit the breakpoint itself and not which particular site. 
--- lldb/test/Shell/Unwind/windows-unaligned-x86_64.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test index 94f1c011ebd2a..0356960424328 100644 --- a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test +++ b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test @@ -17,7 +17,7 @@ breakpoint set -n func # CHECK: Breakpoint 1: where = {{.*}}`{{(::)?}}func process launch -# CHECK: stop reason = breakpoint 1.1 +# CHECK: stop reason = breakpoint 1 thread backtrace # CHECK: frame #0: {{.*}}`{{(::)?}}func From ac17b50f50bad5c1cc306e1813322ed2ae6e1ef0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 18 Nov 2024 12:20:34 -0800 Subject: [PATCH 032/366] [RISCV] Use getSignedTargetConstant. NFC --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 92 +++++++++---------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 15 ++- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 4 +- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 12 +-- 5 files changed, 61 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 034314c88f79f..ca368a18c80d6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -176,8 +176,7 @@ static SDValue selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, RISCVMatInt::InstSeq &Seq) { SDValue SrcReg = CurDAG->getRegister(RISCV::X0, VT); for (const RISCVMatInt::Inst &Inst : Seq) { - SDValue SDImm = - CurDAG->getSignedConstant(Inst.getImm(), DL, VT, /*isTarget=*/true); + SDValue SDImm = CurDAG->getSignedTargetConstant(Inst.getImm(), DL, VT); SDNode *Result = nullptr; switch (Inst.getOpndKind()) { case RISCVMatInt::Imm: @@ -208,10 +207,10 @@ static SDValue selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, // Use a rematerializable pseudo instruction for short sequences if enabled. if (Seq.size() == 2 && UsePseudoMovImm) - return SDValue(CurDAG->getMachineNode(RISCV::PseudoMovImm, DL, VT, - CurDAG->getSignedConstant( - Imm, DL, VT, /*isTarget=*/true)), - 0); + return SDValue( + CurDAG->getMachineNode(RISCV::PseudoMovImm, DL, VT, + CurDAG->getSignedTargetConstant(Imm, DL, VT)), + 0); // See if we can create this constant as (ADD (SLLI X, C), X) where X is at // worst an LUI+ADDIW. 
This will require an extra register, but avoids a @@ -594,7 +593,7 @@ bool RISCVDAGToDAGISel::tryShrinkShlLogicImm(SDNode *Node) { SDNode *BinOp = CurDAG->getMachineNode( BinOpc, DL, VT, Shift.getOperand(0), - CurDAG->getSignedConstant(ShiftedVal, DL, VT, /*isTarget=*/true)); + CurDAG->getSignedTargetConstant(ShiftedVal, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(ShOpc, DL, VT, SDValue(BinOp, 0), CurDAG->getTargetConstant(ShAmt, DL, VT)); @@ -723,11 +722,10 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) { return false; EVT Ty = Ld->getOffset().getValueType(); - SDValue Ops[] = {Ld->getBasePtr(), - CurDAG->getSignedConstant(Offset >> Shift, SDLoc(Node), Ty, - /*isTarget=*/true), - CurDAG->getTargetConstant(Shift, SDLoc(Node), Ty), - Ld->getChain()}; + SDValue Ops[] = { + Ld->getBasePtr(), + CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(Node), Ty), + CurDAG->getTargetConstant(Shift, SDLoc(Node), Ty), Ld->getChain()}; SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(Node), Ld->getValueType(0), Ld->getValueType(1), MVT::Other, Ops); @@ -2515,8 +2513,8 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, if (isInt<12>(CVal)) { Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT()); - Offset = CurDAG->getSignedConstant( - CVal, SDLoc(Addr), Subtarget->getXLenVT(), /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(CVal, SDLoc(Addr), + Subtarget->getXLenVT()); return true; } } @@ -2555,7 +2553,7 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, } else { Base = CurDAG->getRegister(RISCV::X0, VT); } - Offset = CurDAG->getSignedConstant(Lo12, DL, VT, /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(Lo12, DL, VT); return true; } @@ -2577,7 +2575,7 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, assert(!Seq.empty() && "Expected more instructions in sequence"); Base = selectImmSeq(CurDAG, DL, VT, Seq); - Offset = CurDAG->getSignedConstant(Lo12, DL, VT, /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(Lo12, DL, VT); return true; } @@ -2727,7 +2725,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, if (auto *FIN = dyn_cast(Base)) Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); - Offset = CurDAG->getSignedConstant(CVal, DL, VT, /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } } @@ -2744,11 +2742,10 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, if (CVal >= -4096 && CVal <= (4094 - RV32ZdinxRange)) { int64_t Adj = CVal < 0 ? 
-2048 : 2047; Base = SDValue( - CurDAG->getMachineNode( - RISCV::ADDI, DL, VT, Addr.getOperand(0), - CurDAG->getSignedConstant(Adj, DL, VT, /*isTarget=*/true)), + CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0), + CurDAG->getSignedTargetConstant(Adj, DL, VT)), 0); - Offset = CurDAG->getSignedConstant(CVal - Adj, DL, VT, /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(CVal - Adj, DL, VT); return true; } @@ -2802,7 +2799,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, if (auto *FIN = dyn_cast(Base)) Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); - Offset = CurDAG->getSignedConstant(CVal, DL, VT, /*isTarget=*/true); + Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } } @@ -2818,12 +2815,12 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, if ((-2049 >= CVal && CVal >= -4096) || (4065 >= CVal && CVal >= 2017)) { int64_t Adj = CVal < 0 ? -2048 : 2016; int64_t AdjustedOffset = CVal - Adj; - Base = SDValue(CurDAG->getMachineNode( - RISCV::ADDI, DL, VT, Addr.getOperand(0), - CurDAG->getSignedConstant(AdjustedOffset, DL, VT, - /*isTarget=*/true)), - 0); - Offset = CurDAG->getSignedConstant(Adj, DL, VT, /*isTarget=*/true); + Base = + SDValue(CurDAG->getMachineNode( + RISCV::ADDI, DL, VT, Addr.getOperand(0), + CurDAG->getSignedTargetConstant(AdjustedOffset, DL, VT)), + 0); + Offset = CurDAG->getSignedTargetConstant(Adj, DL, VT); return true; } @@ -2969,21 +2966,21 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal, // If the RHS is -2048, we can use xori to produce 0 if the LHS is -2048 and // non-zero otherwise. if (CVal == -2048) { - Val = SDValue(CurDAG->getMachineNode( - RISCV::XORI, DL, N->getValueType(0), LHS, - CurDAG->getSignedConstant(CVal, DL, N->getValueType(0), - /*isTarget=*/true)), - 0); + Val = SDValue( + CurDAG->getMachineNode( + RISCV::XORI, DL, N->getValueType(0), LHS, + CurDAG->getSignedTargetConstant(CVal, DL, N->getValueType(0))), + 0); return true; } // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the // LHS is equal to the RHS and non-zero otherwise. if (isInt<12>(CVal) || CVal == 2048) { - Val = SDValue(CurDAG->getMachineNode( - RISCV::ADDI, DL, N->getValueType(0), LHS, - CurDAG->getSignedConstant(-CVal, DL, N->getValueType(0), - /*isTarget=*/true)), - 0); + Val = SDValue( + CurDAG->getMachineNode( + RISCV::ADDI, DL, N->getValueType(0), LHS, + CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))), + 0); return true; } if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) { @@ -3424,8 +3421,7 @@ bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5, return false; EVT Ty = N->getValueType(0); - Simm5 = CurDAG->getSignedConstant(Offset >> Shift, SDLoc(N), Ty, - /*isTarget=*/true); + Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), Ty); Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), Ty); return true; } @@ -3442,16 +3438,16 @@ bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) { N->getValueType(0)); } else if (C && C->isAllOnes()) { // Treat all ones as VLMax. - VL = CurDAG->getSignedConstant(RISCV::VLMaxSentinel, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + VL = CurDAG->getSignedTargetConstant(RISCV::VLMaxSentinel, SDLoc(N), + N->getValueType(0)); } else if (isa(N) && cast(N)->getReg() == RISCV::X0) { // All our VL operands use an operand that allows GPRNoX0 or an immediate // as the register class. 
Convert X0 to a special immediate to pass the // MachineVerifier. This is recognized specially by the vsetvli insertion // pass. - VL = CurDAG->getSignedConstant(RISCV::VLMaxSentinel, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + VL = CurDAG->getSignedTargetConstant(RISCV::VLMaxSentinel, SDLoc(N), + N->getValueType(0)); } else { VL = N; } @@ -3509,8 +3505,8 @@ static bool selectVSplatImmHelper(SDValue N, SDValue &SplatVal, if (!ValidateImm(SplatImm)) return false; - SplatVal = DAG.getSignedConstant(SplatImm, SDLoc(N), Subtarget.getXLenVT(), - /*isTarget=*/true); + SplatVal = + DAG.getSignedTargetConstant(SplatImm, SDLoc(N), Subtarget.getXLenVT()); return true; } @@ -3610,8 +3606,8 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width, if (!isInt<5>(ImmVal)) return false; - Imm = CurDAG->getSignedConstant(ImmVal, SDLoc(N), Subtarget->getXLenVT(), - /*isTarget=*/true); + Imm = CurDAG->getSignedTargetConstant(ImmVal, SDLoc(N), + Subtarget->getXLenVT()); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 35040734d71df..675809348b0e6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20784,8 +20784,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( if (auto *C = dyn_cast(Op)) { uint64_t CVal = C->getSExtValue(); if (isInt<12>(CVal)) - Ops.push_back(DAG.getSignedConstant( - CVal, SDLoc(Op), Subtarget.getXLenVT(), /*isTarget=*/true)); + Ops.push_back(DAG.getSignedTargetConstant(CVal, SDLoc(Op), + Subtarget.getXLenVT())); } return; case 'J': diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 1908f5e5dede8..5747f05ffafd4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -415,14 +415,14 @@ def AddrRegImm : ComplexPattern; // Return the negation of an immediate value. def NegImm : SDNodeXFormgetSignedConstant(-N->getSExtValue(), SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(-N->getSExtValue(), SDLoc(N), + N->getValueType(0)); }]>; // Return an immediate value minus 32. def ImmSub32 : SDNodeXFormgetSignedConstant(N->getSExtValue() - 32, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(N->getSExtValue() - 32, SDLoc(N), + N->getValueType(0)); }]>; // Return an immediate subtracted from XLen. @@ -454,16 +454,15 @@ def AddiPair : PatLeaf<(imm), [{ def AddiPairImmSmall : SDNodeXFormgetSExtValue(); int64_t Adj = N->getSExtValue() < 0 ? -2048 : 2047; - return CurDAG->getSignedConstant(Imm - Adj, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(Imm - Adj, SDLoc(N), + N->getValueType(0)); }]>; // Return -2048 if immediate is negative or 2047 if positive. These are the // largest simm12 values. def AddiPairImmLarge : SDNodeXFormgetSExtValue() < 0 ? 
-2048 : 2047; - return CurDAG->getSignedConstant(Imm, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(Imm, SDLoc(N), N->getValueType(0)); }]>; def TrailingZeros : SDNodeXForm; def DecImm : SDNodeXFormgetSignedConstant(N->getSExtValue() - 1, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(N->getSExtValue() - 1, SDLoc(N), + N->getValueType(0)); }]>; defvar TAIL_AGNOSTIC = 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index ccb851f9322d6..69e4b30dc4fc8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -145,8 +145,8 @@ def BCLRIANDIMask : PatLeaf<(imm), [{ }]>; def BCLRIANDIMaskLow : SDNodeXFormgetSignedConstant((N->getZExtValue() & 0x7ff) | ~0x7ffull, - SDLoc(N), N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant((N->getZExtValue() & 0x7ff) | ~0x7ffull, + SDLoc(N), N->getValueType(0)); }]>; def CSImm12MulBy4 : PatLeaf<(imm), [{ @@ -167,13 +167,13 @@ def CSImm12MulBy8 : PatLeaf<(imm), [{ }]>; def SimmShiftRightBy2XForm : SDNodeXFormgetSignedConstant(N->getSExtValue() >> 2, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(N->getSExtValue() >> 2, SDLoc(N), + N->getValueType(0)); }]>; def SimmShiftRightBy3XForm : SDNodeXFormgetSignedConstant(N->getSExtValue() >> 3, SDLoc(N), - N->getValueType(0), /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(N->getSExtValue() >> 3, SDLoc(N), + N->getValueType(0)); }]>; // Pattern to exclude simm12 immediates from matching, namely `non_imm12`. From 6a863f7e2679a60f2f38ae6a920d0b6e1a2c1690 Mon Sep 17 00:00:00 2001 From: lntue Date: Mon, 18 Nov 2024 12:44:32 -0800 Subject: [PATCH 033/366] [libc] Fix signed zeros for exp10m1f16 and tanhf16. (#116654) --- libc/src/math/generic/exp10m1f16.cpp | 3 +++ libc/src/math/generic/tanhf16.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp index 9f2c1959fa5ec..449aedf254ca5 100644 --- a/libc/src/math/generic/exp10m1f16.cpp +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -119,6 +119,9 @@ LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) { // When |x| <= 2^(-3). if (x_abs <= 0x3000U) { + if (LIBC_UNLIKELY(x_abs == 0)) + return x; + if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) return r.value(); diff --git a/libc/src/math/generic/tanhf16.cpp b/libc/src/math/generic/tanhf16.cpp index ae9b4be46f7cf..0266b5cfc2df1 100644 --- a/libc/src/math/generic/tanhf16.cpp +++ b/libc/src/math/generic/tanhf16.cpp @@ -64,6 +64,9 @@ LLVM_LIBC_FUNCTION(float16, tanhf16, (float16 x)) { // When |x| <= 0x1.d2p-4. 
if (x_abs <= 0x2f48U) { + if (LIBC_UNLIKELY(x_abs == 0)) + return x; + float xf = x; float xf_sq = xf * xf; // Degree-7 Taylor expansion generated by Sollya with the following From e59582b6f8f1be3e675866f6a5d661eb4c8ed448 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 18 Nov 2024 16:04:41 -0500 Subject: [PATCH 034/366] [libc] avoid type-punning with inactive union member (#116685) --- libc/fuzzing/__support/hashtable_fuzz.cpp | 16 +++++++-------- .../HashTable/generic/bitmask_impl.inc | 11 +++++----- libc/src/__support/hash.h | 20 +++++++++---------- .../src/__support/HashTable/group_test.cpp | 15 +++++++------- .../src/__support/HashTable/table_test.cpp | 2 +- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/libc/fuzzing/__support/hashtable_fuzz.cpp b/libc/fuzzing/__support/hashtable_fuzz.cpp index 7d61e106c9c4a..8ab5e3b55cfd4 100644 --- a/libc/fuzzing/__support/hashtable_fuzz.cpp +++ b/libc/fuzzing/__support/hashtable_fuzz.cpp @@ -10,6 +10,7 @@ /// //===----------------------------------------------------------------------===// #include "include/llvm-libc-types/ENTRY.h" +#include "src/__support/CPP/bit.h" #include "src/__support/CPP/string_view.h" #include "src/__support/HashTable/table.h" #include "src/__support/macros/config.h" @@ -81,15 +82,14 @@ static struct { template T next() { static_assert(cpp::is_integral::value, "T must be an integral type"); - union { - T result; - char data[sizeof(T)]; - }; - for (size_t i = 0; i < sizeof(result); i++) + + char data[sizeof(T)]; + + for (size_t i = 0; i < sizeof(T); i++) data[i] = buffer[i]; - buffer += sizeof(result); - remaining -= sizeof(result); - return result; + buffer += sizeof(T); + remaining -= sizeof(T); + return cpp::bit_cast(data); } cpp::string_view next_string() { diff --git a/libc/src/__support/HashTable/generic/bitmask_impl.inc b/libc/src/__support/HashTable/generic/bitmask_impl.inc index 469ddeeed8a85..d526dc1ece293 100644 --- a/libc/src/__support/HashTable/generic/bitmask_impl.inc +++ b/libc/src/__support/HashTable/generic/bitmask_impl.inc @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/endian_internal.h" #include "src/__support/macros/config.h" @@ -44,13 +45,11 @@ struct Group { // Load a group of control words from an arbitary address. LIBC_INLINE static Group load(const void *addr) { - union { - bitmask_t value; - char bytes[sizeof(bitmask_t)]; - } data; + char bytes[sizeof(bitmask_t)]; + for (size_t i = 0; i < sizeof(bitmask_t); ++i) - data.bytes[i] = static_cast(addr)[i]; - return {data.value}; + bytes[i] = static_cast(addr)[i]; + return Group{cpp::bit_cast(bytes)}; } // Load a group of control words from an aligned address. diff --git a/libc/src/__support/hash.h b/libc/src/__support/hash.h index 527c83993fd59..49138b1f43b9e 100644 --- a/libc/src/__support/hash.h +++ b/libc/src/__support/hash.h @@ -13,8 +13,8 @@ #include "src/__support/CPP/limits.h" // numeric_limits #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" -#include "src/__support/uint128.h" // UInt128 -#include // For uint64_t +#include "src/__support/uint128.h" // UInt128 +#include // For uint64_t namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -34,25 +34,23 @@ LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) { // Therefore, we use a union to read the value. 
template LIBC_INLINE T read_little_endian(const void *ptr) { const uint8_t *bytes = static_cast(ptr); - union { - T value; - uint8_t buffer[sizeof(T)]; - } data; + uint8_t buffer[sizeof(T)]; #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - // Compiler should able to optimize this as a load followed by a byte swap. - // On aarch64 (-mbig-endian), this compiles to the following for int: + // Compiler should able to optimize this as a load followed by a byte + // swap. On aarch64 (-mbig-endian), this compiles to the following for + // int: // ldr w0, [x0] // rev w0, w0 // ret for (size_t i = 0; i < sizeof(T); ++i) { - data.buffer[i] = bytes[sizeof(T) - i - 1]; + buffer[i] = bytes[sizeof(T) - i - 1]; } #else for (size_t i = 0; i < sizeof(T); ++i) { - data.buffer[i] = bytes[i]; + buffer[i] = bytes[i]; } #endif - return data.value; + return cpp::bit_cast(buffer); } // Specialized read functions for small values. size must be <= 8. diff --git a/libc/test/src/__support/HashTable/group_test.cpp b/libc/test/src/__support/HashTable/group_test.cpp index 25b15312ad668..acdc58e205852 100644 --- a/libc/test/src/__support/HashTable/group_test.cpp +++ b/libc/test/src/__support/HashTable/group_test.cpp @@ -8,6 +8,7 @@ #include "src/__support/HashTable/bitmask.h" +#include "src/__support/CPP/bit.h" #include "src/__support/macros/config.h" #include "src/stdlib/rand.h" #include "test/UnitTest/Test.h" @@ -28,14 +29,13 @@ TEST(LlvmLibcHashTableBitMaskTest, Match) { size_t appearance[4][sizeof(Group)]; ByteArray array{}; - union { - uintptr_t random; - int data[sizeof(uintptr_t) / sizeof(int)]; - }; + int data[sizeof(uintptr_t) / sizeof(int)]; for (int &i : data) i = rand(); + uintptr_t random = cpp::bit_cast(data); + for (size_t i = 0; i < sizeof(Group); ++i) { size_t choice = random % 4; random /= 4; @@ -62,14 +62,13 @@ TEST(LlvmLibcHashTableBitMaskTest, MaskAvailable) { for (size_t i = 0; i < sizeof(Group); ++i) { ByteArray array{}; - union { - uintptr_t random; - int data[sizeof(uintptr_t) / sizeof(int)]; - }; + int data[sizeof(uintptr_t) / sizeof(int)]; for (int &j : data) j = rand(); + uintptr_t random = cpp::bit_cast(data); + ASSERT_FALSE(Group::load(array.data).mask_available().any_bit_set()); array.data[i] = 0x80; diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp index f8ffa4d4123d3..c3b8697f2087a 100644 --- a/libc/test/src/__support/HashTable/table_test.cpp +++ b/libc/test/src/__support/HashTable/table_test.cpp @@ -82,7 +82,7 @@ TEST(LlvmLibcTableTest, GrowthSequence) { } TEST(LlvmLibcTableTest, Insertion) { - union key { + struct key { char bytes[2]; } keys[256]; for (size_t k = 0; k < 256; ++k) { From ce0cc8e9eb1ee5613a6fb442179a92c3fabf27c5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 18 Nov 2024 12:34:32 -0800 Subject: [PATCH 035/366] [AArch64][VE][X86] Use getSignedTargetConstant. 
NFC --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 5 ++--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +-- llvm/lib/Target/VE/VEInstrInfo.td | 4 ++-- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 6 ++---- llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 3 +-- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1969c830f4d31..10dad7675f4ea 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -920,8 +920,7 @@ bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) { if ((MulImm % std::abs(Scale)) == 0) { int64_t RDVLImm = MulImm / Scale; if ((RDVLImm >= Low) && (RDVLImm <= High)) { - Imm = CurDAG->getSignedConstant(RDVLImm, SDLoc(N), MVT::i32, - /*isTarget=*/true); + Imm = CurDAG->getSignedTargetConstant(RDVLImm, SDLoc(N), MVT::i32); return true; } } @@ -4283,7 +4282,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { int64_t ImmVal = CNode->getSExtValue(); SDLoc DL(N); if (ImmVal >= -128 && ImmVal < 128) { - Imm = CurDAG->getSignedConstant(ImmVal, DL, MVT::i32, /*isTarget=*/true); + Imm = CurDAG->getSignedTargetConstant(ImmVal, DL, MVT::i32); return true; } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2732e495c552a..ad1d1237aa25a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9299,8 +9299,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. 
- Ops.push_back( - DAG.getSignedConstant(FPDiff, DL, MVT::i32, /*isTarget=*/true)); + Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32)); } if (CLI.PAI) { diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index eb6a852980079..b459fbcad909f 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -44,8 +44,8 @@ def ULO7 : SDNodeXForm; def LO7 : SDNodeXFormgetSignedConstant(SignExtend64(N->getSExtValue(), 7), - SDLoc(N), MVT::i32, /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(SignExtend64(N->getSExtValue(), 7), + SDLoc(N), MVT::i32); }]>; def MIMM : SDNodeXFormgetTargetConstant(val2MImm(getImmVal(N)), diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 72de0e0e8761f..0641dca07a890 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -313,8 +313,7 @@ namespace { Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, AM.SymbolFlags); else - Disp = - CurDAG->getSignedConstant(AM.Disp, DL, MVT::i32, /*isTarget=*/true); + Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32); if (AM.Segment.getNode()) Segment = AM.Segment; @@ -3775,8 +3774,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { } if (MemVT != MVT::i64 || isInt<32>(OperandV)) { - Operand = CurDAG->getSignedConstant(OperandV, SDLoc(Node), MemVT, - /*isTarget=*/true); + Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 3bf61f22c9f1f..8a764de561413 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2428,8 +2428,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Callee); if (isTailCall) - Ops.push_back( - DAG.getSignedConstant(FPDiff, dl, MVT::i32, /*isTarget=*/true)); + Ops.push_back(DAG.getSignedTargetConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. From cde4ae789e4a2f408d06d2b0045cca22c201c47b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 18 Nov 2024 12:42:33 -0800 Subject: [PATCH 036/366] [ARM] Use getSignedTargetConstant. NFC --- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 48 ++++++++++--------------- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 ++-- llvm/lib/Target/ARM/ARMInstrInfo.td | 8 ++--- llvm/lib/Target/ARM/ARMInstrThumb2.td | 4 +-- 4 files changed, 27 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 5c45e081e1b16..73ee8cf81adcd 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -710,8 +710,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } - OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } @@ -880,8 +879,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. 
if (AddSub == ARM_AM::sub) Val *= -1; Offset = CurDAG->getRegister(0, MVT::i32); - Opc = - CurDAG->getSignedConstant(Val, SDLoc(Op), MVT::i32, /*isTarget*/ true); + Opc = CurDAG->getSignedTargetConstant(Val, SDLoc(Op), MVT::i32); return true; } @@ -1185,8 +1183,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, int RHSC; if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { Base = N.getOperand(0); - OffImm = - CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } @@ -1248,8 +1245,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, if (MFI.getObjectAlign(FI) >= Align(4)) { Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); - OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } @@ -1269,8 +1265,8 @@ bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base, Base = N.getOperand(0); if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; - OffImm = CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N), - MVT::i32, /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N), + MVT::i32); return true; } } @@ -1332,8 +1328,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } - OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } @@ -1359,9 +1354,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; - OffImm = - CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32, - /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N), + MVT::i32); return true; } } @@ -1391,8 +1385,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } - OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; } } @@ -1409,10 +1402,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, int RHSC; if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits. OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) - ? CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true) - : CurDAG->getSignedConstant(-RHSC, SDLoc(N), MVT::i32, - /*isTarget=*/true); + ? CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32) + : CurDAG->getSignedTargetConstant(-RHSC, SDLoc(N), MVT::i32); return true; } @@ -1435,8 +1426,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base, if (N.getOpcode() == ISD::SUB) RHSC = -RHSC; - OffImm = CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N), - MVT::i32, /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N), + MVT::i32); return true; } } @@ -1479,10 +1470,10 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, // 7 bit constant, shifted by Shift. if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) - ? 
CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N), - MVT::i32, /*isTarget=*/true) - : CurDAG->getSignedConstant(-RHSC * (1 << Shift), SDLoc(N), - MVT::i32, /*isTarget=*/true); + ? CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), + SDLoc(N), MVT::i32) + : CurDAG->getSignedTargetConstant(-RHSC * (1 << Shift), + SDLoc(N), MVT::i32); return true; } return false; @@ -1492,8 +1483,7 @@ template bool ARMDAGToDAGISel::SelectImmediateInRange(SDValue N, SDValue &OffImm) { int Val; if (isScaledConstantInRange(N, 1, Min, Max, Val)) { - OffImm = - CurDAG->getSignedConstant(Val, SDLoc(N), MVT::i32, /*isTarget=*/true); + OffImm = CurDAG->getSignedTargetConstant(Val, SDLoc(N), MVT::i32); return true; } return false; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 7fce91f97f361..554f7337a6a5a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2970,8 +2970,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Callee); if (isTailCall) { - Ops.push_back( - DAG.getSignedConstant(SPDiff, dl, MVT::i32, /*isTarget=*/true)); + Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32)); } // Add argument registers to the end of the list so that they are known live @@ -20615,8 +20614,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; } - Result = DAG.getSignedConstant(CVal, SDLoc(Op), Op.getValueType(), - /*isTarget=*/true); + Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType()); break; } diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index d24d4af36f0d8..72146f2a717ad 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -371,14 +371,14 @@ def ARMVCCElse : PatLeaf<(i32 2)>; // imm_neg_XFORM - Return the negation of an i32 immediate value. def imm_neg_XFORM : SDNodeXFormgetSignedConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32, - /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(-(int)N->getZExtValue(), SDLoc(N), + MVT::i32); }]>; // imm_not_XFORM - Return the complement of a i32 immediate value. def imm_not_XFORM : SDNodeXFormgetSignedConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32, - /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(~(int)N->getZExtValue(), SDLoc(N), + MVT::i32); }]>; def gi_imm_not_XFORM : GICustomOperandRenderer<"renderInvertedImm">, GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index cb20aacb539ad..4e9160bcfd5ec 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -77,8 +77,8 @@ def t2_so_imm_not_XFORM : SDNodeXFormgetSignedConstant(-((int)N->getZExtValue()), SDLoc(N), - MVT::i32, /*isTarget=*/true); + return CurDAG->getSignedTargetConstant(-((int)N->getZExtValue()), SDLoc(N), + MVT::i32); }]>; // so_imm_notSext_XFORM - Return a so_imm value packed into the format From b42a81631491571c4b78d095917ebdddee69b04f Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 18 Nov 2024 13:23:17 -0800 Subject: [PATCH 037/366] Convert ThreadPlanStack's mutex to a shared mutex. (#116438) I have some reports of A/B inversion deadlocks between the ThreadPlanStack and the StackFrameList accesses. 
There's a fair bit of reasonable code in lldb that does "While accessing the ThreadPlanStack, look at that thread's StackFrameList", and also plenty of "While accessing the StackFrameList, look at that thread's ThreadPlanStack." In all the cases I've seen so far, at most one of the locks taken was trying to mutate the list; the other three were just reading. So we could solve the deadlock by converting the two mutexes over to shared mutexes. This patch is the easy part, the ThreadPlanStack mutex. The tricky part was that these were originally recursive mutexes, and because recursive access to shared mutexes is undefined behavior according to the C++ standard, I had to add a couple of NoLock variants to make sure the lock isn't taken recursively. Then, since the only remaining calls are out to ThreadPlans and ThreadPlans don't have access to their containing ThreadPlanStack, converting this to a non-recursive lock should be safe. --- lldb/include/lldb/Target/ThreadPlanStack.h | 13 ++- lldb/source/Target/ThreadPlanStack.cpp | 108 +++++++++++---------- 2 files changed, 68 insertions(+), 53 deletions(-) diff --git a/lldb/include/lldb/Target/ThreadPlanStack.h b/lldb/include/lldb/Target/ThreadPlanStack.h index e6a560a509261..e0f8104de9a4d 100644 --- a/lldb/include/lldb/Target/ThreadPlanStack.h +++ b/lldb/include/lldb/Target/ThreadPlanStack.h @@ -14,6 +14,8 @@ #include #include +#include "llvm/Support/RWMutex.h" + #include "lldb/Target/Target.h" #include "lldb/Target/Thread.h" #include "lldb/lldb-private-forward.h" @@ -96,9 +98,12 @@ class ThreadPlanStack { void ClearThreadCache(); private: - void PrintOneStack(Stream &s, llvm::StringRef stack_name, - const PlanStack &stack, lldb::DescriptionLevel desc_level, - bool include_internal) const; + lldb::ThreadPlanSP DiscardPlanNoLock(); + lldb::ThreadPlanSP GetCurrentPlanNoLock() const; + void PrintOneStackNoLock(Stream &s, llvm::StringRef stack_name, + const PlanStack &stack, + lldb::DescriptionLevel desc_level, + bool include_internal) const; PlanStack m_plans; ///< The stack of plans this thread is executing. PlanStack m_completed_plans; ///< Plans that have been completed by this @@ -110,7 +115,7 @@ class ThreadPlanStack { size_t m_completed_plan_checkpoint = 0; // Monotonically increasing token for // completed plan checkpoints. 
std::unordered_map m_completed_plan_store; - mutable std::recursive_mutex m_stack_mutex; + mutable llvm::sys::RWMutex m_stack_mutex; }; class ThreadPlanStackMap { diff --git a/lldb/source/Target/ThreadPlanStack.cpp b/lldb/source/Target/ThreadPlanStack.cpp index 1572931429071..d5d600dda47a3 100644 --- a/lldb/source/Target/ThreadPlanStack.cpp +++ b/lldb/source/Target/ThreadPlanStack.cpp @@ -39,21 +39,21 @@ ThreadPlanStack::ThreadPlanStack(const Thread &thread, bool make_null) { void ThreadPlanStack::DumpThreadPlans(Stream &s, lldb::DescriptionLevel desc_level, bool include_internal) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); s.IndentMore(); - PrintOneStack(s, "Active plan stack", m_plans, desc_level, include_internal); - PrintOneStack(s, "Completed plan stack", m_completed_plans, desc_level, - include_internal); - PrintOneStack(s, "Discarded plan stack", m_discarded_plans, desc_level, - include_internal); + PrintOneStackNoLock(s, "Active plan stack", m_plans, desc_level, + include_internal); + PrintOneStackNoLock(s, "Completed plan stack", m_completed_plans, desc_level, + include_internal); + PrintOneStackNoLock(s, "Discarded plan stack", m_discarded_plans, desc_level, + include_internal); s.IndentLess(); } -void ThreadPlanStack::PrintOneStack(Stream &s, llvm::StringRef stack_name, - const PlanStack &stack, - lldb::DescriptionLevel desc_level, - bool include_internal) const { - std::lock_guard guard(m_stack_mutex); +void ThreadPlanStack::PrintOneStackNoLock(Stream &s, llvm::StringRef stack_name, + const PlanStack &stack, + lldb::DescriptionLevel desc_level, + bool include_internal) const { // If the stack is empty, just exit: if (stack.empty()) return; @@ -82,7 +82,7 @@ void ThreadPlanStack::PrintOneStack(Stream &s, llvm::StringRef stack_name, } size_t ThreadPlanStack::CheckpointCompletedPlans() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); m_completed_plan_checkpoint++; m_completed_plan_store.insert( std::make_pair(m_completed_plan_checkpoint, m_completed_plans)); @@ -90,7 +90,7 @@ size_t ThreadPlanStack::CheckpointCompletedPlans() { } void ThreadPlanStack::RestoreCompletedPlanCheckpoint(size_t checkpoint) { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); auto result = m_completed_plan_store.find(checkpoint); assert(result != m_completed_plan_store.end() && "Asked for a checkpoint that didn't exist"); @@ -99,13 +99,13 @@ void ThreadPlanStack::RestoreCompletedPlanCheckpoint(size_t checkpoint) { } void ThreadPlanStack::DiscardCompletedPlanCheckpoint(size_t checkpoint) { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); m_completed_plan_store.erase(checkpoint); } void ThreadPlanStack::ThreadDestroyed(Thread *thread) { // Tell the plan stacks that this thread is going away: - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); for (ThreadPlanSP plan : m_plans) plan->ThreadDestroyed(); @@ -134,20 +134,22 @@ void ThreadPlanStack::PushPlan(lldb::ThreadPlanSP new_plan_sp) { // If the thread plan doesn't already have a tracer, give it its parent's // tracer: // The first plan has to be a base plan: - std::lock_guard guard(m_stack_mutex); - assert((m_plans.size() > 0 || new_plan_sp->IsBasePlan()) && - "Zeroth plan must be a base plan"); - - if (!new_plan_sp->GetThreadPlanTracer()) { - assert(!m_plans.empty()); - new_plan_sp->SetThreadPlanTracer(m_plans.back()->GetThreadPlanTracer()); + { // 
Scope for Lock - DidPush often adds plans to the stack: + llvm::sys::ScopedWriter guard(m_stack_mutex); + assert((m_plans.size() > 0 || new_plan_sp->IsBasePlan()) && + "Zeroth plan must be a base plan"); + + if (!new_plan_sp->GetThreadPlanTracer()) { + assert(!m_plans.empty()); + new_plan_sp->SetThreadPlanTracer(m_plans.back()->GetThreadPlanTracer()); + } + m_plans.push_back(new_plan_sp); } - m_plans.push_back(new_plan_sp); new_plan_sp->DidPush(); } lldb::ThreadPlanSP ThreadPlanStack::PopPlan() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); assert(m_plans.size() > 1 && "Can't pop the base thread plan"); // Note that moving the top element of the vector would leave it in an @@ -161,7 +163,11 @@ lldb::ThreadPlanSP ThreadPlanStack::PopPlan() { } lldb::ThreadPlanSP ThreadPlanStack::DiscardPlan() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); + return DiscardPlanNoLock(); +} + +lldb::ThreadPlanSP ThreadPlanStack::DiscardPlanNoLock() { assert(m_plans.size() > 1 && "Can't discard the base thread plan"); // Note that moving the top element of the vector would leave it in an @@ -177,12 +183,12 @@ lldb::ThreadPlanSP ThreadPlanStack::DiscardPlan() { // If the input plan is nullptr, discard all plans. Otherwise make sure this // plan is in the stack, and if so discard up to and including it. void ThreadPlanStack::DiscardPlansUpToPlan(ThreadPlan *up_to_plan_ptr) { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); int stack_size = m_plans.size(); if (up_to_plan_ptr == nullptr) { for (int i = stack_size - 1; i > 0; i--) - DiscardPlan(); + DiscardPlanNoLock(); return; } @@ -197,23 +203,23 @@ void ThreadPlanStack::DiscardPlansUpToPlan(ThreadPlan *up_to_plan_ptr) { if (found_it) { bool last_one = false; for (int i = stack_size - 1; i > 0 && !last_one; i--) { - if (GetCurrentPlan().get() == up_to_plan_ptr) + if (GetCurrentPlanNoLock().get() == up_to_plan_ptr) last_one = true; - DiscardPlan(); + DiscardPlanNoLock(); } } } void ThreadPlanStack::DiscardAllPlans() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); int stack_size = m_plans.size(); for (int i = stack_size - 1; i > 0; i--) { - DiscardPlan(); + DiscardPlanNoLock(); } } void ThreadPlanStack::DiscardConsultingControllingPlans() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); while (true) { int controlling_plan_idx; bool discard = true; @@ -234,26 +240,30 @@ void ThreadPlanStack::DiscardConsultingControllingPlans() { // First pop all the dependent plans: for (int i = m_plans.size() - 1; i > controlling_plan_idx; i--) { - DiscardPlan(); + DiscardPlanNoLock(); } // Now discard the controlling plan itself. // The bottom-most plan never gets discarded. "OkayToDiscard" for it // means discard it's dependent plans, but not it... 
if (controlling_plan_idx > 0) { - DiscardPlan(); + DiscardPlanNoLock(); } } } lldb::ThreadPlanSP ThreadPlanStack::GetCurrentPlan() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); + return GetCurrentPlanNoLock(); +} + +lldb::ThreadPlanSP ThreadPlanStack::GetCurrentPlanNoLock() const { assert(m_plans.size() != 0 && "There will always be a base plan."); return m_plans.back(); } lldb::ThreadPlanSP ThreadPlanStack::GetCompletedPlan(bool skip_private) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); if (m_completed_plans.empty()) return {}; @@ -271,7 +281,7 @@ lldb::ThreadPlanSP ThreadPlanStack::GetCompletedPlan(bool skip_private) const { lldb::ThreadPlanSP ThreadPlanStack::GetPlanByIndex(uint32_t plan_idx, bool skip_private) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); uint32_t idx = 0; for (lldb::ThreadPlanSP plan_sp : m_plans) { @@ -285,7 +295,7 @@ lldb::ThreadPlanSP ThreadPlanStack::GetPlanByIndex(uint32_t plan_idx, } lldb::ValueObjectSP ThreadPlanStack::GetReturnValueObject() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); if (m_completed_plans.empty()) return {}; @@ -299,7 +309,7 @@ lldb::ValueObjectSP ThreadPlanStack::GetReturnValueObject() const { } lldb::ExpressionVariableSP ThreadPlanStack::GetExpressionVariable() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); if (m_completed_plans.empty()) return {}; @@ -312,23 +322,23 @@ lldb::ExpressionVariableSP ThreadPlanStack::GetExpressionVariable() const { return {}; } bool ThreadPlanStack::AnyPlans() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); // There is always a base plan... return m_plans.size() > 1; } bool ThreadPlanStack::AnyCompletedPlans() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); return !m_completed_plans.empty(); } bool ThreadPlanStack::AnyDiscardedPlans() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); return !m_discarded_plans.empty(); } bool ThreadPlanStack::IsPlanDone(ThreadPlan *in_plan) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); for (auto plan : m_completed_plans) { if (plan.get() == in_plan) return true; @@ -337,7 +347,7 @@ bool ThreadPlanStack::IsPlanDone(ThreadPlan *in_plan) const { } bool ThreadPlanStack::WasPlanDiscarded(ThreadPlan *in_plan) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); for (auto plan : m_discarded_plans) { if (plan.get() == in_plan) return true; @@ -346,7 +356,7 @@ bool ThreadPlanStack::WasPlanDiscarded(ThreadPlan *in_plan) const { } ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); if (current_plan == nullptr) return nullptr; @@ -361,7 +371,7 @@ ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const { // If this is the first completed plan, the previous one is the // bottom of the regular plan stack. if (stack_size > 0 && m_completed_plans[0].get() == current_plan) { - return GetCurrentPlan().get(); + return GetCurrentPlanNoLock().get(); } // Otherwise look for it in the regular plans. 
@@ -374,7 +384,7 @@ ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const { } ThreadPlan *ThreadPlanStack::GetInnermostExpression() const { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); int stack_size = m_plans.size(); for (int i = stack_size - 1; i > 0; i--) { @@ -385,13 +395,13 @@ ThreadPlan *ThreadPlanStack::GetInnermostExpression() const { } void ThreadPlanStack::ClearThreadCache() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedReader guard(m_stack_mutex); for (lldb::ThreadPlanSP thread_plan_sp : m_plans) thread_plan_sp->ClearThreadCache(); } void ThreadPlanStack::WillResume() { - std::lock_guard guard(m_stack_mutex); + llvm::sys::ScopedWriter guard(m_stack_mutex); m_completed_plans.clear(); m_discarded_plans.clear(); } From e44c28f07ede2bd693e2372317880f57a635fa73 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 18 Nov 2024 15:28:17 -0600 Subject: [PATCH 038/366] [clang] Replace "can't" and "can not" in diagnostics with "cannot" (#116623) See https://discourse.llvm.org/t/cant-cannot-can-not-in-diagnostic-messages/83171 --- .../clang/Basic/DiagnosticCommonKinds.td | 6 ++--- .../clang/Basic/DiagnosticDriverKinds.td | 2 +- .../clang/Basic/DiagnosticRefactoringKinds.td | 4 ++-- .../clang/Basic/DiagnosticSemaKinds.td | 16 +++++++------- .../musttail-forward-declaration-inline.c | 2 +- .../musttail-forward-declaration-weak.c | 2 +- .../CodeGen/PowerPC/musttail-indirect.cpp | 2 +- clang/test/CodeGen/PowerPC/musttail-inline.c | 2 +- .../test/CodeGen/PowerPC/musttail-undefined.c | 2 +- clang/test/CodeGen/PowerPC/musttail-weak.c | 2 +- clang/test/CodeGen/PowerPC/musttail.c | 2 +- clang/test/CodeGen/X86/x86_64-PR42672.c | 10 ++++----- clang/test/Driver/module-output.cppm | 2 +- .../Misc/pragma-attribute-strict-subjects.c | 6 ++--- clang/test/Modules/no-eager-load.cppm | 4 ++-- .../same-decl-in-different-modules.cppm | 8 +++---- clang/test/OpenMP/for_simd_loop_messages.cpp | 2 +- .../masked_taskloop_simd_linear_messages.cpp | 2 +- .../master_taskloop_simd_linear_messages.cpp | 2 +- .../parallel_for_simd_loop_messages.cpp | 2 +- .../OpenMP/parallel_for_simd_messages.cpp | 2 +- ...l_masked_taskloop_simd_linear_messages.cpp | 2 +- ...l_master_taskloop_simd_linear_messages.cpp | 2 +- clang/test/OpenMP/simd_linear_messages.cpp | 2 +- ...get_parallel_for_simd_ordered_messages.cpp | 22 +++++++++---------- .../OpenMP/taskloop_simd_linear_messages.cpp | 2 +- clang/test/Parser/pragma-attribute.cpp | 12 +++++----- clang/test/Refactor/Extract/ObjCProperty.m | 2 +- clang/test/Sema/asm.c | 4 ++-- .../Sema/pragma-attribute-strict-subjects.c | 18 +++++++-------- clang/test/SemaObjC/comptypes-legal.m | 2 +- clang/test/SemaOpenCL/access-qualifier.cl | 8 +++---- 32 files changed, 79 insertions(+), 79 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index 0c131166aff28..f4a155bb00bb3 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -364,9 +364,9 @@ def err_target_unsupported_abi_with_fpu : Error< def err_ppc_impossible_musttail: Error< "'musttail' attribute for this call is impossible because %select{" - "long calls can not be tail called on PPC|" - "indirect calls can not be tail called on PPC|" - "external calls can not be tail called on PPC}0" + "long calls cannot be tail called on PPC|" + "indirect calls cannot be tail called on PPC|" + "external calls cannot be tail 
called on PPC}0" >; def err_aix_musttail_unsupported: Error< "'musttail' attribute is not supported on AIX">; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 76fdbdbfb01d9..5155b23d151c0 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -553,7 +553,7 @@ def err_test_module_file_extension_format : Error< "'blockname:major:minor:hashed:user info'">; def err_drv_module_output_with_multiple_arch : Error< - "option '-fmodule-output' can't be used with multiple arch options">; + "option '-fmodule-output' cannot be used with multiple arch options">; def warn_drv_delayed_template_parsing_after_cxx20 : Warning< "-fdelayed-template-parsing is deprecated after C++20">, diff --git a/clang/include/clang/Basic/DiagnosticRefactoringKinds.td b/clang/include/clang/Basic/DiagnosticRefactoringKinds.td index 5446b32efbdd4..e060fffc7280a 100644 --- a/clang/include/clang/Basic/DiagnosticRefactoringKinds.td +++ b/clang/include/clang/Basic/DiagnosticRefactoringKinds.td @@ -14,7 +14,7 @@ let Component = "Refactoring" in { let CategoryName = "Refactoring Invocation Issue" in { -def err_refactor_no_selection : Error<"refactoring action can't be initiated " +def err_refactor_no_selection : Error<"refactoring action cannot be initiated " "without a selection">; def err_refactor_selection_no_symbol : Error<"there is no symbol at the given " "location">; @@ -26,7 +26,7 @@ def err_refactor_code_outside_of_function : Error<"the selected code is not a " def err_refactor_extract_simple_expression : Error<"the selected expression " "is too simple to extract">; def err_refactor_extract_prohibited_expression : Error<"the selected " - "expression can't be extracted">; + "expression cannot be extracted">; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 17eb28e8fc562..3caf471d3037f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -1151,7 +1151,7 @@ def err_pragma_attribute_matcher_subrule_contradicts_rule : Error< def err_pragma_attribute_matcher_negated_subrule_contradicts_subrule : Error< "negated attribute subject matcher sub-rule '%0' contradicts sub-rule '%1'">; def err_pragma_attribute_invalid_matchers : Error< - "attribute %0 can't be applied to %1">; + "attribute %0 cannot be applied to %1">; def err_pragma_attribute_stack_mismatch : Error< "'#pragma clang attribute %select{%1.|}0pop' with no matching" " '#pragma clang attribute %select{%1.|}0push'">; @@ -6150,7 +6150,7 @@ def err_mismatched_owning_module : Error< "declaration of %0 in %select{the global module|module %2}1 follows " "declaration in %select{the global module|module %4}3">; def err_multiple_decl_in_different_modules : Error< - "declaration %0 attached to named module '%1' can't be attached to " + "declaration %0 attached to named module '%1' cannot be attached to " "other modules">; def err_redefinition_different_type : Error< "redefinition of %0 with a different type%diff{: $ vs $|}1,2">; @@ -8560,7 +8560,7 @@ def err_typecheck_missing_return_type_incompatible : Error< "literal|lambda expression}2 has unspecified explicit return type">; def note_incomplete_class_and_qualified_id : Note< - "conformance of forward class %0 to protocol %1 can not be confirmed">; + "conformance of forward class %0 to protocol %1 cannot be confirmed">; def warn_incompatible_qualified_id : 
Warning< "%select{%diff{assigning to $ from incompatible type $|" "assigning to type from incompatible type}0,1" @@ -9414,7 +9414,7 @@ let CategoryName = "Inline Assembly Issue" in { "asm constraint has an unexpected number of alternatives: %0 vs %1">; def err_asm_incomplete_type : Error<"asm operand has incomplete type %0">; def err_asm_unknown_register_name : Error<"unknown register name '%0' in asm">; - def err_asm_unwind_and_goto : Error<"unwind clobber can't be used with asm goto">; + def err_asm_unwind_and_goto : Error<"unwind clobber cannot be used with asm goto">; def err_asm_invalid_global_var_reg : Error<"register '%0' unsuitable for " "global register variables on this target">; def err_asm_register_size_mismatch : Error<"size of register '%0' does not " @@ -9433,7 +9433,7 @@ let CategoryName = "Inline Assembly Issue" in { def err_asm_input_duplicate_match : Error< "more than one input constraint matches the same output '%0'">; def err_store_value_to_reg : Error< - "impossible constraint in asm: can't store value into a register">; + "impossible constraint in asm: cannot store value into a register">; def warn_asm_label_on_auto_decl : Warning< "ignored asm label '%0' on automatic variable">; @@ -10960,7 +10960,7 @@ def err_opencl_builtin_pipe_invalid_access_modifier : Error< def err_opencl_invalid_access_qualifier : Error< "access qualifier can only be used for pipe and image type">; def err_opencl_invalid_read_write : Error< - "access qualifier %0 can not be used for %1 %select{|prior to OpenCL C version 2.0 or in version 3.0 " + "access qualifier %0 cannot be used for %1 %select{|prior to OpenCL C version 2.0 or in version 3.0 " "and without __opencl_c_read_write_images feature}2">; def err_opencl_multiple_access_qualifiers : Error< "multiple access qualifiers">; @@ -11460,7 +11460,7 @@ def err_omp_wrong_linear_modifier : Error< def err_omp_wrong_linear_modifier_non_reference : Error< "variable of non-reference type %0 can be used only with 'val' modifier, but used with '%1'">; def err_omp_step_simple_modifier_exclusive : Error< - "step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier">; + "step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier">; def err_omp_wrong_simdlen_safelen_values : Error< "the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter">; def err_omp_wrong_if_directive_name_modifier : Error< @@ -11534,7 +11534,7 @@ def err_omp_schedule_nonmonotonic_static : Error< def err_omp_simple_clause_incompatible_with_ordered : Error< "'%0' clause with '%1' modifier cannot be specified if an 'ordered' clause is specified">; def err_omp_ordered_simd : Error< - "'ordered' clause with a parameter can not be specified in '#pragma omp %0' directive">; + "'ordered' clause with a parameter cannot be specified in '#pragma omp %0' directive">; def err_omp_variable_in_given_clause_and_dsa : Error< "%0 variable cannot be in a %1 clause in '#pragma omp %2' directive">; def err_omp_param_or_this_in_clause : Error< diff --git a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c index 3d8ff3985cb0f..d0ec21209582e 100644 --- a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c +++ b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c @@ -3,7 +3,7 @@ inline int func2(int i); int external_call2(int i) { - // expected-error@+1 {{'musttail' attribute for this call is 
impossible because external calls can not be tail called on PPC}} + // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}} [[clang::musttail]] return func2(i); } diff --git a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c index 4314bbdd30619..57226d2109f32 100644 --- a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c +++ b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c @@ -3,7 +3,7 @@ int func2(int i); int external_call2(int i) { - // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}} + // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}} [[clang::musttail]] return func2(i); } diff --git a/clang/test/CodeGen/PowerPC/musttail-indirect.cpp b/clang/test/CodeGen/PowerPC/musttail-indirect.cpp index 3f495002606d4..cc506d4f7bc1f 100644 --- a/clang/test/CodeGen/PowerPC/musttail-indirect.cpp +++ b/clang/test/CodeGen/PowerPC/musttail-indirect.cpp @@ -3,6 +3,6 @@ void name(int *params) { auto fn = (void (*)(int *))1; - // expected-error@+1 {{'musttail' attribute for this call is impossible because indirect calls can not be tail called on PPC}} + // expected-error@+1 {{'musttail' attribute for this call is impossible because indirect calls cannot be tail called on PPC}} [[clang::musttail]] return fn(params); } diff --git a/clang/test/CodeGen/PowerPC/musttail-inline.c b/clang/test/CodeGen/PowerPC/musttail-inline.c index 05aac88697127..1ac841f088cf5 100644 --- a/clang/test/CodeGen/PowerPC/musttail-inline.c +++ b/clang/test/CodeGen/PowerPC/musttail-inline.c @@ -7,6 +7,6 @@ inline int foo(int x) { int bar(int x) { - // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}} + // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}} [[clang::musttail]] return foo(1); } diff --git a/clang/test/CodeGen/PowerPC/musttail-undefined.c b/clang/test/CodeGen/PowerPC/musttail-undefined.c index f2259adb01848..fb3845218a622 100644 --- a/clang/test/CodeGen/PowerPC/musttail-undefined.c +++ b/clang/test/CodeGen/PowerPC/musttail-undefined.c @@ -5,6 +5,6 @@ int foo(int x); int bar(int x) { - // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}} + // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}} [[clang::musttail]] return foo(x); } diff --git a/clang/test/CodeGen/PowerPC/musttail-weak.c b/clang/test/CodeGen/PowerPC/musttail-weak.c index dccc7a4d8cdd2..1070b91bc5f35 100644 --- a/clang/test/CodeGen/PowerPC/musttail-weak.c +++ b/clang/test/CodeGen/PowerPC/musttail-weak.c @@ -7,7 +7,7 @@ __attribute__((weak)) int func2(int i) { return 0; } int external_call2(int i) { - // linux-error@+2 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}} + // linux-error@+2 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}} // aix-error@+1 {{'musttail' attribute is not supported on AIX}} [[clang::musttail]] return func2(i); } diff --git a/clang/test/CodeGen/PowerPC/musttail.c b/clang/test/CodeGen/PowerPC/musttail.c index 
e3129263d2460..7a74d084c67be 100644 --- a/clang/test/CodeGen/PowerPC/musttail.c +++ b/clang/test/CodeGen/PowerPC/musttail.c @@ -14,7 +14,7 @@ int foo(int x) { int bar(int x) { // good-no-diagnostics - // longcall-error@+2 {{'musttail' attribute for this call is impossible because long calls can not be tail called on PPC}} + // longcall-error@+2 {{'musttail' attribute for this call is impossible because long calls cannot be tail called on PPC}} // aix-error@+1 {{'musttail' attribute is not supported on AIX}} [[clang::musttail]] return foo(1); } diff --git a/clang/test/CodeGen/X86/x86_64-PR42672.c b/clang/test/CodeGen/X86/x86_64-PR42672.c index 6fe612d0aabdb..42894c0c4cb57 100644 --- a/clang/test/CodeGen/X86/x86_64-PR42672.c +++ b/clang/test/CodeGen/X86/x86_64-PR42672.c @@ -58,7 +58,7 @@ void odd_struct(void) { : "=r"(str)); #endif } -// CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: can't store value into a register +// CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: cannot store value into a register // Check Clang reports an error if attempting to return a big structure via a register. void big_struct(void) { @@ -70,7 +70,7 @@ void big_struct(void) { : "=r"(str)); #endif } -// CHECK-IMPOSSIBLE_BIG: impossible constraint in asm: can't store value into a register +// CHECK-IMPOSSIBLE_BIG: impossible constraint in asm: cannot store value into a register // Clang is able to emit LLVM IR for an 16-byte structure. void x_constraint_fit(void) { @@ -103,7 +103,7 @@ void x_constraint_nofit(void) { // http://crbug.com/999160 // Clang used to report the following message: -// "impossible constraint in asm: can't store struct into a register" +// "impossible constraint in asm: cannot store struct into a register" // for the assembly directive below, although there's no struct. 
void crbug_999160_regtest(void) { #ifdef IMPOSSIBLE_9BYTES @@ -113,7 +113,7 @@ void crbug_999160_regtest(void) { #endif } -// CHECK-IMPOSSIBLE_9BYTES: impossible constraint in asm: can't store value into a register +// CHECK-IMPOSSIBLE_9BYTES: impossible constraint in asm: cannot store value into a register void crbug_999160_regtest_v2(void) { #ifdef IMPOSSIBLE_9BYTES_V2 @@ -121,4 +121,4 @@ void crbug_999160_regtest_v2(void) { asm("" : "=r"(buf) : "0"(buf)); #endif } -// CHECK-IMPOSSIBLE_9BYTES_V2: impossible constraint in asm: can't store value into a register +// CHECK-IMPOSSIBLE_9BYTES_V2: impossible constraint in asm: cannot store value into a register diff --git a/clang/test/Driver/module-output.cppm b/clang/test/Driver/module-output.cppm index bf7bfbf3cb574..7cf0771f3d6ef 100644 --- a/clang/test/Driver/module-output.cppm +++ b/clang/test/Driver/module-output.cppm @@ -42,7 +42,7 @@ export module Hello; // CHECK: "-emit-module-interface" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/output/Hello.pcm" "-x" "c++" "{{.*}}/Hello.cppm" // CHECK: "-emit-obj" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/output/Hello.o" "-x" "pcm" "{{.*}}/output/Hello.pcm" -// MULTIPLE-ARCH: option '-fmodule-output' can't be used with multiple arch options +// MULTIPLE-ARCH: option '-fmodule-output' cannot be used with multiple arch options // CHECK-SPECIFIED: "-emit-module-interface" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/pcm/Hello.pcm" "-x" "c++" "{{.*}}/Hello.cppm" // CHECK-SPECIFIED: "-emit-obj" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/Hello.o" "-x" "pcm" "{{.*}}/pcm/Hello.pcm" diff --git a/clang/test/Misc/pragma-attribute-strict-subjects.c b/clang/test/Misc/pragma-attribute-strict-subjects.c index 7c2548c7dfc26..807977fb252aa 100644 --- a/clang/test/Misc/pragma-attribute-strict-subjects.c +++ b/clang/test/Misc/pragma-attribute-strict-subjects.c @@ -51,7 +51,7 @@ struct testRecoverStrictnessStruct { }; #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), variable, enum)) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}} int testRecoverExtraVar = 0; // CHECK-LABEL: VarDecl{{.*}} testRecoverExtraVar @@ -188,7 +188,7 @@ struct testSubset7Struct { }; #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(record(unless(is_union)), function, variable, enum, enum_constant)) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}} int testSubsetRecoverVar; // CHECK-LABEL: VarDecl{{.*}} testSubsetRecoverVar @@ -205,7 +205,7 @@ struct testSubsetRecoverStruct { }; #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = enum) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}} int testSubsetNoVar; // CHECK-LABEL: VarDecl{{.*}} testSubsetNoVar diff --git a/clang/test/Modules/no-eager-load.cppm b/clang/test/Modules/no-eager-load.cppm index c9eddaaed1555..aa6de44c998f3 100644 --- a/clang/test/Modules/no-eager-load.cppm +++ b/clang/test/Modules/no-eager-load.cppm @@ -44,7 +44,7 @@ void use() { // expected-note@* {{but in 'a' found a different body}} } -// expected-error@a.cppm:* {{declaration 'foo' 
attached to named module 'a' can't be attached to other modules}} +// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' cannot be attached to other modules}} // expected-note@b.cppm:* {{}} //--- h.cppm @@ -59,5 +59,5 @@ void use() { // expected-note@* {{but in 'a' found a different body}} } -// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' can't be attached to other modules}} +// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' cannot be attached to other modules}} // expected-note@b.cppm:* {{}} diff --git a/clang/test/Modules/same-decl-in-different-modules.cppm b/clang/test/Modules/same-decl-in-different-modules.cppm index 2e8e90f7cd8e9..8ad9e29051d4e 100644 --- a/clang/test/Modules/same-decl-in-different-modules.cppm +++ b/clang/test/Modules/same-decl-in-different-modules.cppm @@ -32,11 +32,11 @@ void test() { S s; } -// expected-error@mod1.cppm:* {{declaration 'v' attached to named module 'mod1' can't be attached to other modules}} +// expected-error@mod1.cppm:* {{declaration 'v' attached to named module 'mod1' cannot be attached to other modules}} // expected-note@mod2.cppm:* {{}} -// expected-error@mod1.cppm:* {{declaration 'func' attached to named module 'mod1' can't be attached to other modules}} +// expected-error@mod1.cppm:* {{declaration 'func' attached to named module 'mod1' cannot be attached to other modules}} // expected-note@mod2.cppm:* {{}} -// expected-error@mod1.cppm:* {{declaration 'A' attached to named module 'mod1' can't be attached to other modules}} +// expected-error@mod1.cppm:* {{declaration 'A' attached to named module 'mod1' cannot be attached to other modules}} // expected-note@mod2.cppm:* {{}} -// expected-error@mod1.cppm:* 1+{{declaration 'S' attached to named module 'mod1' can't be attached to other modules}} +// expected-error@mod1.cppm:* 1+{{declaration 'S' attached to named module 'mod1' cannot be attached to other modules}} // expected-note@mod2.cppm:* 1+{{}} diff --git a/clang/test/OpenMP/for_simd_loop_messages.cpp b/clang/test/OpenMP/for_simd_loop_messages.cpp index 1cc5988ea8092..74a52f3f5d694 100644 --- a/clang/test/OpenMP/for_simd_loop_messages.cpp +++ b/clang/test/OpenMP/for_simd_loop_messages.cpp @@ -731,7 +731,7 @@ void test_ordered() { for (int i = 0; i < 16; ++i) ; #pragma omp parallel -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp for simd' directive}} #pragma omp for simd ordered(1) for (int i = 0; i < 16; ++i) ; diff --git a/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp index 50d2da7e8fd4d..6072ad1b92445 100644 --- a/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp +++ b/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp @@ -152,7 +152,7 @@ template int foomain(I argc, C **argv) { #pragma omp masked taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} 
#pragma omp masked taskloop simd linear (a, b: val, B::ib) diff --git a/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp index ee29f63e110c0..c1bf61b8183ec 100644 --- a/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp @@ -152,7 +152,7 @@ template int foomain(I argc, C **argv) { #pragma omp master taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} #pragma omp master taskloop simd linear (a, b: val, B::ib) diff --git a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp index f55453f6e8e15..4760a0281df54 100644 --- a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp +++ b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp @@ -638,7 +638,7 @@ void test_ordered() { #pragma omp parallel for simd ordered for (int i = 0; i < 16; ++i) ; -//expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp parallel for simd' directive}} +//expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp parallel for simd' directive}} #pragma omp parallel for simd ordered(1) for (int i = 0; i < 16; ++i) ; diff --git a/clang/test/OpenMP/parallel_for_simd_messages.cpp b/clang/test/OpenMP/parallel_for_simd_messages.cpp index 8237406a1c068..b3408fab4417f 100644 --- a/clang/test/OpenMP/parallel_for_simd_messages.cpp +++ b/clang/test/OpenMP/parallel_for_simd_messages.cpp @@ -94,7 +94,7 @@ void test_ordered() { #pragma omp parallel for simd ordered for (int i = 0; i < 16; ++i) ; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp parallel for simd' directive}} #pragma omp parallel for simd ordered(1) for (int i = 0; i < 16; ++i) ; diff --git a/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp index a913a4e331964..bda3ef09181a6 100644 --- a/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp +++ b/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp @@ -152,7 +152,7 @@ template int foomain(I argc, C **argv) { #pragma omp parallel masked taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} #pragma omp parallel masked taskloop simd linear (a, b: val, B::ib) diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp 
b/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp index 2be29fdc6b929..01a734cd927e2 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp @@ -152,7 +152,7 @@ template int foomain(I argc, C **argv) { #pragma omp parallel master taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} #pragma omp parallel master taskloop simd linear (a, b: val, B::ib) diff --git a/clang/test/OpenMP/simd_linear_messages.cpp b/clang/test/OpenMP/simd_linear_messages.cpp index a19fad9d7718a..68a2999fdf65a 100644 --- a/clang/test/OpenMP/simd_linear_messages.cpp +++ b/clang/test/OpenMP/simd_linear_messages.cpp @@ -142,7 +142,7 @@ template int foomain(I argc, C **argv) { #pragma omp simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} #pragma omp simd linear (a, b: val, B::ib) diff --git a/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp index 8dd7f68c25fd8..73ea96eb24278 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp @@ -29,26 +29,26 @@ T tmain(T argc, S **argv) { #pragma omp target parallel for simd ordered() // expected-error {{expected expression}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+2 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+2 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} #pragma omp target parallel for simd ordered(argc for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target 
parallel for simd ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered((ST > 0) ? 1 + ST : 2) for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error@+3 {{argument to 'ordered' clause must be a strictly positive integer value}} // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}} -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5) for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; @@ -60,15 +60,15 @@ T tmain(T argc, S **argv) { #pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(1) for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(N) for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i - ST]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(2) foo(); return argc; @@ -85,11 +85,11 @@ int main(int argc, char **argv) { #pragma omp target parallel for simd ordered() // expected-error {{expected expression}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target 
parallel for simd ordered(2 + 2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}} for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i - 4]; @@ -115,7 +115,7 @@ int main(int argc, char **argv) { // expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}} #pragma omp target parallel for simd ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} foo(); -// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}} +// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}} #pragma omp target parallel for simd ordered(2) foo(); return tmain(argc, argv); diff --git a/clang/test/OpenMP/taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/taskloop_simd_linear_messages.cpp index 22e2d26cb5561..5bf4785f14be4 100644 --- a/clang/test/OpenMP/taskloop_simd_linear_messages.cpp +++ b/clang/test/OpenMP/taskloop_simd_linear_messages.cpp @@ -152,7 +152,7 @@ template int foomain(I argc, C **argv) { #pragma omp taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}} for (int k = 0; k < argc; ++k) ++k; #if defined(OMP52) - // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}} + // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}} // expected-error@+2 {{linear variable with incomplete type 'S1'}} // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}} #pragma omp taskloop simd linear (a, b: val, B::ib) diff --git a/clang/test/Parser/pragma-attribute.cpp b/clang/test/Parser/pragma-attribute.cpp index 6377fc754352e..d5b1f848abd06 100644 --- a/clang/test/Parser/pragma-attribute.cpp +++ b/clang/test/Parser/pragma-attribute.cpp @@ -124,7 +124,7 @@ void function(); #pragma clang attribute push (__attribute__((annotate)), apply_to=function foo) // expected-error {{extra tokens after attribute in a '#pragma clang attribute push'}} #pragma clang attribute push (__attribute__((objc_bridge_related)), apply_to=function) -// expected-error@-1 {{attribute 'objc_bridge_related' can't be applied to 'function'}} +// expected-error@-1 {{attribute 'objc_bridge_related' cannot be applied to 'function'}} #pragma clang attribute pop #pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}} @@ -182,15 +182,15 @@ _Pragma("clang attribute pop"); #pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_local)) #pragma clang attribute pop -#pragma clang attribute push([[clang::uninitialized]], apply_to = function) // expected-error {{attribute 'uninitialized' can't be applied to 'function'}} +#pragma clang attribute push([[clang::uninitialized]], apply_to = function) // expected-error {{attribute 'uninitialized' cannot be applied to 'function'}} #pragma clang attribute pop -#pragma clang attribute push([[clang::uninitialized]], apply_to = variable) // expected-error {{attribute 'uninitialized' can't be applied to 'variable'}} +#pragma clang attribute push([[clang::uninitialized]], apply_to = variable) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable'}} #pragma clang attribute pop 
-#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_thread_local)) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_thread_local)'}} +#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_thread_local)) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_thread_local)'}} #pragma clang attribute pop -#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_global)) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_global)'}} +#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_global)) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_global)'}} #pragma clang attribute pop -#pragma clang attribute push([[clang::uninitialized]], apply_to = any(variable(is_parameter), variable(unless(is_parameter)))) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_parameter)', and 'variable(unless(is_parameter))'}} +#pragma clang attribute push([[clang::uninitialized]], apply_to = any(variable(is_parameter), variable(unless(is_parameter)))) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_parameter)', and 'variable(unless(is_parameter))'}} #pragma clang attribute pop // We're allowed to apply attributes to subsets of allowed subjects. #pragma clang attribute push([[clang::no_destroy]], apply_to = variable) diff --git a/clang/test/Refactor/Extract/ObjCProperty.m b/clang/test/Refactor/Extract/ObjCProperty.m index 152ccb3484215..23c9a8941b7ab 100644 --- a/clang/test/Refactor/Extract/ObjCProperty.m +++ b/clang/test/Refactor/Extract/ObjCProperty.m @@ -36,6 +36,6 @@ - (void)prohibitSetterExtraction { /*range prohibit_setter=->+0:55*/self.implicitSetter = 0; } // CHECK: 2 'prohibit_setter' results: -// CHECK: the selected expression can't be extracted +// CHECK: the selected expression cannot be extracted @end diff --git a/clang/test/Sema/asm.c b/clang/test/Sema/asm.c index 28ef3ec6ce09c..a9cff5947ef5d 100644 --- a/clang/test/Sema/asm.c +++ b/clang/test/Sema/asm.c @@ -90,7 +90,7 @@ int test7(unsigned long long b) { // PR3904 void test8(int i) { - // A number in an input constraint can't point to a read-write constraint. + // A number in an input constraint cannot point to a read-write constraint. asm("" : "+r" (i), "=r"(i) : "0" (i)); // expected-error{{invalid input constraint '0' in asm}} } @@ -359,7 +359,7 @@ void test19(long long x) asm ("" : "=rm" (x): "0" (a)); // expected-error {{unsupported inline asm: input with type 'st_size64' (aka 'struct _st_size64') matching output with type 'long long'}} // FIXME: This case is actually supported by codegen. asm ("" : "=rm" (a): "0" (d)); // expected-error {{unsupported inline asm: input with type 'st_size32' (aka 'struct _st_size32') matching output with type 'st_size64' (aka 'struct _st_size64')}} - asm ("" : "=rm" (b): "0" (1)); // expected-error {{impossible constraint in asm: can't store value into a register}} + asm ("" : "=rm" (b): "0" (1)); // expected-error {{impossible constraint in asm: cannot store value into a register}} // FIXME: This case should be supported by codegen, but it fails now. asm ("" : "=rm" (e): "0" (1)); // no-error // FIXME: This case should be supported by codegen, but it fails now. 
diff --git a/clang/test/Sema/pragma-attribute-strict-subjects.c b/clang/test/Sema/pragma-attribute-strict-subjects.c index 4f37c271ce34a..85b484799529a 100644 --- a/clang/test/Sema/pragma-attribute-strict-subjects.c +++ b/clang/test/Sema/pragma-attribute-strict-subjects.c @@ -52,16 +52,16 @@ #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), variable, enum)) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}} #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(enum_constant, function, record(unless(is_union)), variable, variable(is_parameter), enum)) // FIXME: comma in this diagnostic is wrong. -// expected-error@-2 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}} +// expected-error@-2 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}} #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), enum)) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}} #pragma clang attribute pop // Verify the non-strict subject set verification. @@ -96,12 +96,12 @@ #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(record(unless(is_union)), function, variable, enum, enum_constant)) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}} #pragma clang attribute pop #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = enum) -// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}} +// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}} #pragma clang attribute pop @@ -124,21 +124,21 @@ #pragma clang attribute pop #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_interface, objc_protocol)) -// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}} +// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}} #pragma clang attribute pop #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_protocol)) -// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}} +// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}} // Don't report an error about missing 'objc_interface' as we aren't parsing // Objective-C. 
#pragma clang attribute pop #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_interface, objc_protocol)) -// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}} +// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}} #pragma clang attribute pop #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_protocol)) -// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}} +// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}} // Don't report an error about missing 'objc_interface' as we aren't parsing // Objective-C. #pragma clang attribute pop diff --git a/clang/test/SemaObjC/comptypes-legal.m b/clang/test/SemaObjC/comptypes-legal.m index 09c3a7261bd58..8e332f42be842 100644 --- a/clang/test/SemaObjC/comptypes-legal.m +++ b/clang/test/SemaObjC/comptypes-legal.m @@ -41,7 +41,7 @@ @interface I - (void) Meth : (id )aKey; // expected-note {{passing argument to parameter 'aKey' here}} @end -@class ForwarClass; // expected-note 3 {{conformance of forward class 'ForwarClass' to protocol 'NSCopying' can not be confirmed}} +@class ForwarClass; // expected-note 3 {{conformance of forward class 'ForwarClass' to protocol 'NSCopying' cannot be confirmed}} ForwarClass *Test10751015 (I* pi, ForwarClass *ns_forward) { diff --git a/clang/test/SemaOpenCL/access-qualifier.cl b/clang/test/SemaOpenCL/access-qualifier.cl index 726253c0b1a23..d1c9b5e35af6c 100644 --- a/clang/test/SemaOpenCL/access-qualifier.cl +++ b/clang/test/SemaOpenCL/access-qualifier.cl @@ -36,7 +36,7 @@ void myRead(read_only image1d_t); #if (__OPENCL_C_VERSION__ == 200) || ((__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300) && defined(__opencl_c_read_write_images)) void myReadWrite(read_write image1d_t); #else -void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' can not be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}} +void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' cannot be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}} #endif @@ -94,9 +94,9 @@ kernel void k11(read_only write_only image1d_t i){} // expected-error{{multiple kernel void k12(read_only read_only image1d_t i){} // expected-warning {{duplicate 'read_only' declaration specifier}} #if (__OPENCL_C_VERSION__ == 200) || ((__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300) && defined(__opencl_c_read_write_images)) -kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'read_only pipe int'}} +kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' cannot be used for 'read_only pipe int'}} #else -kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' can not be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}} +kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' cannot be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images 
feature}} #endif #if defined(__OPENCL_C_VERSION__) && __OPENCL_C_VERSION__ < 200 @@ -116,7 +116,7 @@ kernel void k14(read_only pipe int p) { kernel void pipe_ro_twice(read_only read_only pipe int i){} // expected-warning{{duplicate 'read_only' declaration specifier}} // Conflicting access qualifiers -kernel void pipe_ro_twice_tw(read_write read_only read_only pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'read_only pipe int'}} +kernel void pipe_ro_twice_tw(read_write read_only read_only pipe int i){} // expected-error{{access qualifier 'read_write' cannot be used for 'read_only pipe int'}} kernel void pipe_ro_wo(read_only write_only pipe int i){} // expected-error{{multiple access qualifiers}} typedef read_only pipe int ROPipeInt; From 6dceb0e34ed3dd4be72d211abb8c9c447bd57735 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 13:33:05 -0800 Subject: [PATCH 039/366] AMDGPU: Add V_CVT_F32_BF16 for gfx950 (#116311) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 12 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 + llvm/test/MC/AMDGPU/gfx950_asm_vop1.s | 75 +++++++++++- .../Disassembler/AMDGPU/gfx950_dasm_vop1.txt | 110 +++++++++++++++++- 6 files changed, 205 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 09f8dde07b740..2039d9ccc8116 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -438,6 +438,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; +def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", + "HasBF16ConversionInsts", + "true", + "Has bf16 conversion instructions" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -1504,7 +1510,8 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug, FeatureGFX950Insts, - FeaturePrngInst + FeaturePrngInst, + FeatureBF16ConversionInsts ])>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -2144,6 +2151,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, + AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 334322f533e54..ece26a4adb375 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -51,6 +51,7 @@ class AMDGPUSubtarget { bool Has16BitInsts = false; bool HasTrue16BitInsts = false; bool EnableRealTrue16Insts = false; + bool HasBF16ConversionInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; bool HasDsSrc2Insts = false; @@ -166,6 +167,10 @@ class AMDGPUSubtarget { // supported and the support for fake True16 instructions is removed. 
bool useRealTrue16Insts() const; + bool hasBF16ConversionInsts() const { + return HasBF16ConversionInsts; + } + bool hasMadMixInsts() const { return HasMadMixInsts; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 9f8e6a082d965..71bd5ece32bc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2774,6 +2774,7 @@ def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>; +def VOP_F32_BF16 : VOPProfile <[f32, bf16, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e99f562688926..3cda173207dfb 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -311,6 +311,9 @@ let OtherPredicates = [UseRealTrue16Insts] in let OtherPredicates = [UseFakeTrue16Insts] in defm V_CVT_F32_F16_fake16 : VOP1Inst <"v_cvt_f32_f16_fake16", VOPProfile_Fake16, any_fpextend>; +let SubtargetPredicate = HasBF16ConversionInsts in +defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; + let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; @@ -1514,6 +1517,8 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; +defm V_CVT_F32_BF16 : VOP1_Real_gfx9 <0x5b>; + defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>; defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s index 0cb292ffe63dd..66dae85ee8e3e 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s @@ -1,4 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=GFX940-ERR --strict-whitespace %s v_prng_b32 v5, v1 // GFX950: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e] @@ -55,3 +56,75 @@ v_prng_b32 v5, src_scc v_prng_b32 v255, 0xaf123456 // GFX950: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf] // GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 +// GFX950: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xb7,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v127 +// GFX950: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xb7,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, s1 +// GFX950: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, vcc_lo +// GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: 
[0x6a,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, vcc_hi +// GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, ttmp15 +// GFX950: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, m0 +// GFX950: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7c,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, exec_lo +// GFX950: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, exec_hi +// GFX950: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, -1 +// GFX950: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, 0.5 +// GFX950: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, src_scc +// GFX950: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v127, 0x8000 +// GFX950: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, -v1 +// GFX950: v_cvt_f32_bf16_e64 v5, -v1 ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, |v1| +// GFX950: v_cvt_f32_bf16_e64 v5, |v1| ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, -|v1| +// GFX950: v_cvt_f32_bf16_e64 v5, -|v1| ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16 v5, v1 clamp mul:2 +// GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp mul:2 ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64 v5, v1 clamp div:2 +// GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2 ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18] +// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt index 91ab05e99f1e7..336a26907891a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt @@ -40,4 +40,112 @@ 0xfd,0xb0,0x0a,0x7e # GFX950: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf] -0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf \ No newline at end of file +0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf + +# GFX950: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xb7,0x0a,0x7e] +0x01,0xb7,0x0a,0x7e + +# GFX950: 
v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xb7,0x0a,0x7e] +0x7f,0xb7,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xb6,0x0a,0x7e] +0x01,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xb6,0x0a,0x7e] +0x6a,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e] +0x6b,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e] +0x7b,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7c,0xb6,0x0a,0x7e] +0x7c,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e] +0x7e,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e] +0x7f,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] +0xc1,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] +0xf0,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] +0xfd,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00] +0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00 + +# GFX950: v_cvt_f32_bf16_e64 v5, -v1 ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20] +0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20 + +# GFX950: v_cvt_f32_bf16_e64 v5, |v1| ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00] +0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00 + +# GFX950: v_cvt_f32_bf16_e64 v5, -|v1| ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20] +0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20 + +# GFX950: v_cvt_f32_bf16_e64 v5, 0.5 clamp mul:2 ; encoding: [0x05,0x80,0x9b,0xd1,0xf0,0x00,0x00,0x08] +0x05,0x80,0x9b,0xd1,0xf0,0x00,0x00,0x08 + +# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2 ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18] +0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18 + +# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2 ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18] +0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18 + +# GFX950: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xb7,0x0a,0x7e] +0x01,0xb7,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xb7,0x0a,0x7e] +0x7f,0xb7,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xb6,0x0a,0x7e] +0x01,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xb6,0x0a,0x7e] +0x6a,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e] +0x6b,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e] +0x7b,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, m0 ; encoding: [0x7c,0xb6,0x0a,0x7e] +0x7c,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e] +0x7e,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e] +0x7f,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] +0xc1,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] +0xf0,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v5, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] +0xfd,0xb6,0x0a,0x7e + +# GFX950: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00] +0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00 + +# GFX950: v_cvt_f32_bf16_e64 v5, -v1 ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20] +0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20 + +# GFX950: v_cvt_f32_bf16_e64 v5, |v1| ; encoding: 
[0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00] +0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00 + +# GFX950: v_cvt_f32_bf16_e64 v5, -|v1| ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20] +0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20 + +# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp mul:2 ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08] +0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08 From 0c421687f897b530a0fd3481fa03a2d29fd0b97c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 18 Nov 2024 13:38:07 -0800 Subject: [PATCH 040/366] AMDGPU: Add first gfx950 mfma instructions (#116312) Scheduling info and hazards are wrong and TBD. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 25 +- .../builtins-amdgcn-error-gfx950-param.cl | 21 ++ .../builtins-amdgcn-error-gfx950.cl | 12 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 9 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 +- .../Target/AMDGPU/AMDGPUSearchableTables.td | 2 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 4 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 22 ++ .../UniformityAnalysis/AMDGPU/intrinsics.ll | 17 ++ .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 274 ++++++++++++++++++ llvm/test/MC/AMDGPU/mai-gfx950.s | 112 +++++++ .../MC/Disassembler/AMDGPU/gfx950_mai.txt | 61 ++++ llvm/test/tools/llvm-mca/AMDGPU/gfx950.s | 18 ++ 16 files changed, 592 insertions(+), 3 deletions(-) create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll create mode 100644 llvm/test/MC/AMDGPU/mai-gfx950.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt create mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx950.s diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 61516eb2a4a72..6917d8d1aca69 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -431,6 +431,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-conversion- TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-conversion-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-insts") +//===----------------------------------------------------------------------===// +// GFX950 only builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts") + //===----------------------------------------------------------------------===// // GFX12+ only builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index dcdeee6b6acc4..a644a60f9ec38 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -2,6 +2,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950 #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -222,7 +223,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c) #endif // MFMA_GFX90A_TESTS -#ifdef MFMA_GFX940_TESTS +#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c) @@ -404,4 +405,24 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, in { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0); } -#endif // MFMA_GFX940_TESTS +#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) + +#ifdef MFMA_GFX950_TESTS + +// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16( +// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3) + +v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c) +{ + return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3); +} + +// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16 +// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3) +v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c) +{ + return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3); +} + + +#endif diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl new file mode 100644 index 0000000000000..4c267e2cac5ca --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl @@ -0,0 +1,21 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx950 -verify -S -o - %s + +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float16 __attribute__((ext_vector_type(16))); +typedef half half8 __attribute__((ext_vector_type(8))); + + +void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) { + + *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, 0, X); // expected-error{{argument to 
'__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}} +} + + +void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16 c, int X) { + *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl new file mode 100644 index 0000000000000..0b3a8e78e1c79 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -0,0 +1,12 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s + +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float16 __attribute__((ext_vector_type(16))); +typedef half half8 __attribute__((ext_vector_type(8))); + +void test(__global float4* out0, half8 a0, half8 b0, float4 c0, + __global float16* out1, half8 a1, half8 b1, float16 c1) { + *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}} + *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ed73f0a69e613..ec1234e7bc7d9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3110,6 +3110,15 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +//===----------------------------------------------------------------------===// +// gfx950 intrinsics +//===----------------------------------------------------------------------===// + +defset list AMDGPUMFMAIntrinsics950 = { +def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic; +} + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 2039d9ccc8116..d3543015d667f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1983,6 +1983,10 @@ def isNotGFX940Plus : Predicate<"!Subtarget->hasGFX940Insts()">, AssemblerPredicate<(all_of (not FeatureGFX940Insts))>; +def HasGFX950Insts : + Predicate<"Subtarget->hasGFX950Insts()">, + AssemblerPredicate<(all_of FeatureGFX950Insts)>; + def isGFX8GFX9NotGFX940 : Predicate<"!Subtarget->hasGFX940Insts() &&" "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 03e57db9c11ce..b648b68f3bd2b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4747,7 +4747,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: - case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { + case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_f16: + case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 60fa2adc62dc8..2ea254e64b8cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -333,6 +333,8 @@ foreach intr = AMDGPUMFMAIntrinsics90A in def : SourceOfDivergence; foreach intr = AMDGPUMFMAIntrinsics940 in def : SourceOfDivergence; +foreach intr = AMDGPUMFMAIntrinsics950 in +def : SourceOfDivergence; foreach intr = AMDGPUWMMAIntrinsicsGFX11 in def : SourceOfDivergence; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e722e046092fd..4a6efe533230b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1285,6 +1285,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // hasGFX90AInsts is also true. bool hasGFX940Insts() const { return GFX940Insts; } + // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that + // hasGFX940Insts and hasGFX90AInsts are also true. 
+ bool hasGFX950Insts() const { return GFX950Insts; } + bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 71bd5ece32bc4..882e147dc231f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2845,6 +2845,10 @@ def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>; def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>; def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>; +def VOP_V4F32_V8F16_V8F16_V4F32 : VOPProfile <[v4f32, v8f16, v8f16, v4f32]>; +def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>; + + class Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index e246d433401f9..58e26a96ece20 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -626,6 +626,11 @@ def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC; def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC; +def VOPProfileMAI_F32_V8F16_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V8F16_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI; + class MFMATable { bit IsMac = is_mac; string FMAOp = Name; @@ -739,6 +744,11 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", } // End SubtargetPredicate = HasMAIInsts +let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in { +defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>; +defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>; +} + let Predicates = [isGFX90APlus] in { let is_gfx940_xdl = 1 in { defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; @@ -1650,6 +1660,16 @@ multiclass VOP3P_Real_MFMA_gfx940 op, string Name = !cast(N } } +multiclass VOP3P_Real_MFMA_gfx950 op, string Name = !cast(NAME#"_e64").Mnemonic, + VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), + VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { + let SubtargetPredicate = HasGFX950Insts, + AssemblerPredicate = HasGFX950Insts in { + defm "" : VOP3P_Real_MFMA_gfx940; + } +} + + multiclass VOP3P_Real_MFMA_vi op> { def _vi : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_MAI (NAME#"_e64").Pfl, ?> { @@ -1764,6 +1784,8 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>; defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; +defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">; +defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">; defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; let SubtargetPredicate = HasXF32Insts in { diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index b215fc2c2ae74..c457d867af361 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ 
b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -261,6 +261,23 @@ bb: ret void } +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) + +; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0) +define amdgpu_kernel void @mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) { + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0) + store <4 x float> %result, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0) +define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) { + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0) + store <16 x float> %result, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll new file mode 100644 index 0000000000000..88d04e9fb428a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) + +; -------------------------------------------------------------------- +; llvm.amdgcn.mfma.f32.16x16x32.f16 +; -------------------------------------------------------------------- + +define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_16x16x32_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> 
@test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_f32_16x16x32_f16__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { +; GCN-LABEL: test_mfma_f32_16x16x32_f16__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v0 +; GCN-NEXT: v_accvgpr_write_b32 a1, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v3 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @test_mfma_f32_16x16x32_f16___flags__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { +; GCN-LABEL: test_mfma_f32_16x16x32_f16___flags__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v0 +; GCN-NEXT: v_accvgpr_write_b32 a1, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v3 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) + ret <4 x float> %result +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.mfma.f32.32x32x16.f16 +; -------------------------------------------------------------------- + +define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, 
v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: v_accvgpr_write_b32 a4, v12 +; GCN-NEXT: v_accvgpr_write_b32 a5, v13 +; GCN-NEXT: v_accvgpr_write_b32 a6, v14 +; GCN-NEXT: v_accvgpr_write_b32 a7, v15 +; GCN-NEXT: v_accvgpr_write_b32 a8, v16 +; GCN-NEXT: v_accvgpr_write_b32 a9, v17 +; GCN-NEXT: v_accvgpr_write_b32 a10, v18 +; GCN-NEXT: v_accvgpr_write_b32 a11, v19 +; GCN-NEXT: v_accvgpr_write_b32 a12, v20 +; GCN-NEXT: v_accvgpr_write_b32 a13, v21 +; GCN-NEXT: v_accvgpr_write_b32 a14, v22 +; GCN-NEXT: v_accvgpr_write_b32 a15, v23 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v0 +; GCN-NEXT: v_accvgpr_write_b32 a1, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v3 +; GCN-NEXT: v_accvgpr_write_b32 a4, v4 +; GCN-NEXT: v_accvgpr_write_b32 a5, v5 +; GCN-NEXT: v_accvgpr_write_b32 a6, v6 +; GCN-NEXT: v_accvgpr_write_b32 a7, v7 +; 
GCN-NEXT: v_accvgpr_write_b32 a8, v8 +; GCN-NEXT: v_accvgpr_write_b32 a9, v9 +; GCN-NEXT: v_accvgpr_write_b32 a10, v10 +; GCN-NEXT: v_accvgpr_write_b32 a11, v11 +; GCN-NEXT: v_accvgpr_write_b32 a12, v12 +; GCN-NEXT: v_accvgpr_write_b32 a13, v13 +; GCN-NEXT: v_accvgpr_write_b32 a14, v14 +; GCN-NEXT: v_accvgpr_write_b32 a15, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) + ret <16 x float> %result +} + +define <16 x float> @test_mfma_f32_32x32x16_f16__flags__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { +; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags__mac: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v0 +; GCN-NEXT: v_accvgpr_write_b32 a1, v1 +; GCN-NEXT: v_accvgpr_write_b32 a2, v2 +; GCN-NEXT: v_accvgpr_write_b32 a3, v3 +; GCN-NEXT: v_accvgpr_write_b32 a4, v4 +; GCN-NEXT: v_accvgpr_write_b32 a5, v5 +; GCN-NEXT: v_accvgpr_write_b32 a6, v6 +; GCN-NEXT: v_accvgpr_write_b32 a7, v7 +; GCN-NEXT: v_accvgpr_write_b32 a8, v8 +; GCN-NEXT: v_accvgpr_write_b32 a9, v9 +; GCN-NEXT: v_accvgpr_write_b32 a10, v10 +; GCN-NEXT: v_accvgpr_write_b32 a11, v11 +; GCN-NEXT: v_accvgpr_write_b32 a12, v12 +; GCN-NEXT: v_accvgpr_write_b32 a13, v13 +; GCN-NEXT: v_accvgpr_write_b32 a14, v14 +; GCN-NEXT: v_accvgpr_write_b32 a15, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) + ret <16 x float> %result +} diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s new file mode 100644 index 0000000000000..deba548b6ae8e --- /dev/null +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -0,0 +1,112 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX950 %s +// RUN: not llvm-mc -triple=amdgcn 
-mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefix=ERR %s + +//===----------------------------------------------------------------------===// +// MFMA opcodes. +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// v_mfma_f32_16x16x32_f16 +//===----------------------------------------------------------------------===// + +// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] + +// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32f16 v[0:3], v[0:3], v[0:3], v[0:3] + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32f16 a[0:3], a[0:3], a[0:3], a[0:3] + +// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 + +// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 + +// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04] 
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] + +// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] + +//===----------------------------------------------------------------------===// +// v_mfma_f32_32x32x16_f16 +//===----------------------------------------------------------------------===// + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] + +// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16f16 v[0:15], v[0:3], v[0:3], v[0:15] + +// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16f16 a[0:15], a[0:3], a[0:3], a[0:15] + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 + +// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 + +// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 + +// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 + +// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt new file mode 100644 index 0000000000000..68b52672b445d --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -0,0 +1,61 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c] +0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13] +0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13 + +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04] +0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04 + +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c] +0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c + +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b] +0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b + +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04] +0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04 + +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4] +0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4 + +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b] +0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b + +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c] +0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c + +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c] +0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c + +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03] +0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03 + +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04] +0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04 + +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04] +0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04 + +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4] +0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4 + +# GFX950: 
v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04] +0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04 diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s new file mode 100644 index 0000000000000..66affe8f930af --- /dev/null +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s @@ -0,0 +1,18 @@ +# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s + +# CHECK: Iterations: 1 +# CHECK: Instructions: 4 +# CHECK: Total Cycles: 25 +# CHECK: Total uOps: 4 + + +v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] +v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] +v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 + +# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 From f8d1905a24c16bf6db42d428672401156ef6a473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Mon, 18 Nov 2024 22:39:23 +0100 Subject: [PATCH 041/366] [GlobalISel] Combine [S,U]SUBO (#116489) We import the llvm.ssub.with.overflow.* Intrinsics, but the Legalizer also builds them while legalizing other opcodes, see narrowScalarAddSub. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../CodeGen/GlobalISel/GenericMachineInstrs.h | 17 + .../include/llvm/Target/GlobalISel/Combine.td | 16 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 75 ++ llvm/lib/Target/AArch64/AArch64Combine.td | 6 +- .../AArch64/GlobalISel/combine-overflow.mir | 101 +++ llvm/test/CodeGen/AArch64/popcount.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 358 ++++---- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 788 +++++++++--------- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 291 ++++--- 10 files changed, 932 insertions(+), 727 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b1232a368a365..55c3b72c8e027 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -939,6 +939,9 @@ class CombinerHelper { // merge_values(_, zero) -> zext bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo); + // overflow sub + bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 4de14dee190fb..9e5d4d34f24d2 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut { } }; +/// Represents overflowing sub operations. 
+/// G_USUBO, G_SSUBO +class GSubCarryOut : public GBinOpCarryOut { +public: + bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; } + + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_USUBO: + case TargetOpcode::G_SSUBO: + return true; + default: + return false; + } + } +}; + /// Represents overflowing add/sub operations that also consume a carry-in. /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE class GAddSubCarryInOut : public GAddSubCarryOut { diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index f8379609bf1d9..b0c63fc7c7b80 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1385,6 +1385,12 @@ def match_addos : GICombineRule< [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +def match_subo_no_overflow : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SSUBO, G_USUBO):$root, + [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def match_extract_of_element_undef_vector: GICombineRule < (defs root:$root), (match (G_IMPLICIT_DEF $vector), @@ -1901,6 +1907,12 @@ def cmp_combines: GICombineGroup<[ redundant_binop_in_equality ]>; + +def overflow_combines: GICombineGroup<[ + match_addos, + match_subo_no_overflow +]>; + // FIXME: These should use the custom predicate feature once it lands. def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -1984,9 +1996,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector, match_addos, + combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, - combine_use_vector_truncate, merge_combines]>; + combine_use_vector_truncate, merge_combines, overflow_combines]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 83d78c0bde399..d95fc8cfbcf55 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -7790,3 +7790,78 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI, return true; } + +bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI, + BuildFnTy &MatchInfo) { + const GSubCarryOut *Subo = cast(&MI); + + Register Dst = Subo->getReg(0); + Register LHS = Subo->getLHSReg(); + Register RHS = Subo->getRHSReg(); + Register Carry = Subo->getCarryOutReg(); + LLT DstTy = MRI.getType(Dst); + LLT CarryTy = MRI.getType(Carry); + + // Check legality before known bits. 
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) || + !isConstantLegalOrBeforeLegalizer(CarryTy)) + return false; + + ConstantRange KBLHS = + ConstantRange::fromKnownBits(KB->getKnownBits(LHS), + /* IsSigned=*/Subo->isSigned()); + ConstantRange KBRHS = + ConstantRange::fromKnownBits(KB->getKnownBits(RHS), + /* IsSigned=*/Subo->isSigned()); + + if (Subo->isSigned()) { + // G_SSUBO + switch (KBLHS.signedSubMayOverflow(KBRHS)) { + case ConstantRange::OverflowResult::MayOverflow: + return false; + case ConstantRange::OverflowResult::NeverOverflows: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap); + B.buildConstant(Carry, 0); + }; + return true; + } + case ConstantRange::OverflowResult::AlwaysOverflowsLow: + case ConstantRange::OverflowResult::AlwaysOverflowsHigh: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSub(Dst, LHS, RHS); + B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(), + /*isVector=*/CarryTy.isVector(), + /*isFP=*/false)); + }; + return true; + } + } + return false; + } + + // G_USUBO + switch (KBLHS.unsignedSubMayOverflow(KBRHS)) { + case ConstantRange::OverflowResult::MayOverflow: + return false; + case ConstantRange::OverflowResult::NeverOverflows: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap); + B.buildConstant(Carry, 0); + }; + return true; + } + case ConstantRange::OverflowResult::AlwaysOverflowsLow: + case ConstantRange::OverflowResult::AlwaysOverflowsHigh: { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSub(Dst, LHS, RHS); + B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(), + /*isVector=*/CarryTy.isVector(), + /*isFP=*/false)); + }; + return true; + } + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 8af8cdfeba6ac..1b1d81fcd07a2 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -322,13 +322,13 @@ def AArch64PostLegalizerCombiner extractvecelt_pairwise_add, redundant_or, mul_const, redundant_sext_inreg, form_bitfield_extract, rotate_out_of_range, - icmp_to_true_false_known_bits, - select_combines, fold_merge_to_zext, + icmp_to_true_false_known_bits, overflow_combines, + select_combines, fold_merge_to_zext, merge_combines, constant_fold_binops, identity_combines, ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, select_to_minmax, or_to_bsp, combine_concat_vector, - commute_constant_to_rhs, merge_combines, + commute_constant_to_rhs, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir index bc4b5ae7c066a..20cba54923548 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir @@ -176,3 +176,104 @@ body: | $q1 = COPY %o_wide RET_ReallyLR implicit $w0 ... 
+--- +name: sub_may +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: sub_may + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512 + ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const + ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1) + ; CHECK-NEXT: $w0 = COPY %sub(s32) + ; CHECK-NEXT: $w1 = COPY %o_wide(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %const:_(s32) = G_CONSTANT i32 512 + %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const + %o_wide:_(s32) = G_ZEXT %o(s1) + $w0 = COPY %sub(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... +--- +name: usub_may +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: usub_may + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512 + ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const + ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1) + ; CHECK-NEXT: $w0 = COPY %sub(s32) + ; CHECK-NEXT: $w1 = COPY %o_wide(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %const:_(s32) = G_CONSTANT i32 512 + %sub:_(s32), %o:_(s1) = G_USUBO %0, %const + %o_wide:_(s32) = G_ZEXT %o(s1) + $w0 = COPY %sub(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... +--- +name: usub_may_carry_s11 +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: usub_may_carry_s11 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512 + ; CHECK-NEXT: %sub:_(s32), %o:_(s11) = G_USUBO [[COPY]], %const + ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s11) + ; CHECK-NEXT: $w0 = COPY %sub(s32) + ; CHECK-NEXT: $w1 = COPY %o_wide(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %const:_(s32) = G_CONSTANT i32 512 + %sub:_(s32), %o:_(s11) = G_USUBO %0, %const + %o_wide:_(s32) = G_ZEXT %o(s11) + $w0 = COPY %sub(s32) + $w1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... +--- +name: usub_may_carry_s11_vector +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: usub_may_carry_s11_vector + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512 + ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: %bv1:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), %const(s32) + ; CHECK-NEXT: %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1 + ; CHECK-NEXT: %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>) + ; CHECK-NEXT: $q0 = COPY %sub(<4 x s32>) + ; CHECK-NEXT: $q1 = COPY %o_wide(<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w0 + %2:_(s32) = COPY $w0 + %3:_(s32) = COPY $w0 + %const:_(s32) = G_CONSTANT i32 512 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), %0(s32), %1(s32), %2(s32) + %bv1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %const(s32) + %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1 + %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>) + $q0 = COPY %sub(<4 x s32>) + $q1 = COPY %o_wide + RET_ReallyLR implicit $w0 +... 
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 1fc4de1c48b7d..f9f1cd4b1fcf7 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) { ; ; GISEL-LABEL: popcount256: ; GISEL: // %bb.0: // %Entry -; GISEL-NEXT: ldp x8, x9, [x0, #16] +; GISEL-NEXT: ldp x8, x9, [x0] ; GISEL-NEXT: mov v0.d[0], x8 -; GISEL-NEXT: ldp x8, x10, [x0] +; GISEL-NEXT: ldp x8, x10, [x0, #16] ; GISEL-NEXT: mov v1.d[0], x8 ; GISEL-NEXT: mov v0.d[1], x9 ; GISEL-NEXT: mov v1.d[1], x10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 613c73f7b9368..14b30e0d79946 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_trunc_f32_e32 v8, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 
0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, 
v9, v[0:1] +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v5, vcc +; 
GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, 
v7, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1392,23 +1392,23 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index d5e22df59ccb3..ee7a040e41fd5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1106,210 +1106,210 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, 
v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_trunc_f32_e32 v8, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 ; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v4 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v1, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] -; 
GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 
1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1321,18 +1321,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1699,210 +1699,210 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; 
GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_trunc_f32_e32 v8, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; 
GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, 
v11 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v4 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v1, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: 
v_cndmask_b32_e32 v16, v16, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 
v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1914,18 +1914,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, 
v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -3194,59 +3194,59 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v17, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v5 +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, 
v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1] -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v1 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2] +; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 @@ -3274,7 +3274,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0, v3 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index a7e5ce3d21619..faad7e93da5d3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1095,192 +1095,189 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: s_sub_u32 s4, 0, 0x12d8fb -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: s_mov_b32 s4, 1 +; GISEL-NEXT: v_mov_b32_e32 v5, 0xffed2705 +; GISEL-NEXT: s_mov_b32 s5, 1 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GISEL-NEXT: s_subb_u32 s4, 0, 0 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_subb_u32 s5, 0, 0 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; 
GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, s4, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s5, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, s4, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, s7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 -; GISEL-NEXT: v_mul_lo_u32 v18, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v20, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, s4, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, s5, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, 
vcc, v12, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v6, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, s4, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, s5, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, s4, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, s7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v5 -; GISEL-NEXT: v_mul_lo_u32 v15, s4, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v20, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v18, v5, v12 -; GISEL-NEXT: v_mul_lo_u32 v21, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v7, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, s4, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s5, v6 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v5 +; GISEL-NEXT: 
v_mul_lo_u32 v21, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v22, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v21, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v16 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v16, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v17, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v18, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v18, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 
1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v17, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v18 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v17, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v6, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v1, v7, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v12 +; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, vcc, v3, v5, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 From 31aa7f34e07c901773993dac0f33568307f96da6 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 18 Nov 2024 13:41:54 -0800 Subject: [PATCH 042/366] [mlir][Affine] Let affine.[de]linearize_index omit outer bounds (#116103) The affine.delinearize_index and affine.linearize_index operations, as currently defined, require providing a length N basis to [de]linearize N values. 
The first value in this basis is never used during lowering and is unused during lowering. (Note that, even though it isn't used during lowering it can still be used to, for example, remove length-1 outputs from a delinearize). This dead value makes sense in the original context of these operations, which is linearizing or de-linearizing indexes to memref<>s, vector<>s, and other shaped types, where that outer bound is avaliable and may be useful for analysis. However, other usecases exist where the outer bound is not known. For example: %thread_id_x = gpu.thread_id x : index %0:3 = affine.delinearize_index %thread_id_x into (4, 16) : index,index, index In this code, we don't know the upper bound of the thread ID, but we do want to construct the ?x4x16 grid of delinearized values in order to further partition the GPU threads. In order to support such usecases, we broaden the definition of affine.delinearize_index and affine.linearize_index to make the outer bound optional. In the case of affine.delinearize_index, where the number of results is a function of the size of the passed-in basis, we augment all existing builders with a `hasOuterBound` argument, which, for backwards compatibilty and to preserve the natural usage of the op, defaults to `true`. If this flag is true, the op returns one result per basis element, if it is false, it returns one extra result in position 0. We also update existing canonicalization patterns (and move one of them into the folder) to handle these cases. Note that disagreements about the outer bound now no longer prevent delinearize/linearize cancelations. --- .../mlir/Dialect/Affine/IR/AffineOps.td | 64 +++++- mlir/include/mlir/Dialect/Affine/Utils.h | 14 +- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 209 +++++++++++------- .../Transforms/AffineExpandIndexOps.cpp | 11 +- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 24 +- .../Affine/affine-expand-index-ops.mlir | 13 +- mlir/test/Dialect/Affine/canonicalize.mlir | 172 ++++++++++++++ mlir/test/Dialect/Affine/invalid.mlir | 18 +- mlir/test/python/dialects/affine.py | 4 +- 9 files changed, 402 insertions(+), 127 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index ea65911af43a1..76d97f106dcb8 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -1060,8 +1060,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> { // AffineDelinearizeIndexOp //===----------------------------------------------------------------------===// -def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", - [Pure, DeclareOpInterfaceMethods]> { +def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> { let summary = "delinearize an index"; let description = [{ The `affine.delinearize_index` operation takes a single index value and @@ -1083,6 +1082,25 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", %indices_1 = affine.apply #map1()[%linear_index] %indices_2 = affine.apply #map2()[%linear_index] ``` + + The basis may either contain `N` or `N-1` elements, where `N` is the number of results. + If there are N basis elements, the first one will not be used during computations, + but may be used during analysis and canonicalization to eliminate terms from + the `affine.delinearize_index` or to enable conclusions about the total size of + `%linear_index`. + + If the basis is fully provided, the delinearize_index operation is said to "have + an outer bound". 
The builders assume that an `affine.delinearize_index` has + an outer bound by default, as this is how the operation was initially defined. + + That is, the example above could also have been written + ```mlir + %0:3 = affine.delinearize_index %linear_index into (244, 244) : index, index + ``` + + Note that, due to the constraints of affine maps, all the basis elements must + be strictly positive. A dynamic basis element being 0 or negative causes + undefined behavior. }]; let arguments = (ins Index:$linear_index, @@ -1097,17 +1115,27 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", }]; let builders = [ - OpBuilder<(ins "Value":$linear_index, "ValueRange":$basis)>, - OpBuilder<(ins "Value":$linear_index, "ArrayRef":$basis)>, - OpBuilder<(ins "Value":$linear_index, "ArrayRef":$basis)> + OpBuilder<(ins "Value":$linear_index, "ValueRange":$dynamic_basis, "ArrayRef":$static_asis, CArg<"bool", "true">:$hasOuterBound)>, + OpBuilder<(ins "Value":$linear_index, "ValueRange":$basis, CArg<"bool", "true">:$hasOuterBound)>, + OpBuilder<(ins "Value":$linear_index, "ArrayRef":$basis, CArg<"bool", "true">:$hasOuterBound)>, + OpBuilder<(ins "Value":$linear_index, "ArrayRef":$basis, CArg<"bool", "true">:$hasOuterBound)> ]; let extraClassDeclaration = [{ + /// Return true if the basis includes a bound on the first index input. + bool hasOuterBound() { + return getMultiIndex().size() == getStaticBasis().size(); + } + /// Returns a vector with all the static and dynamic basis values. SmallVector getMixedBasis() { OpBuilder builder(getContext()); return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder); } + + /// Return a vector that contains the basis of the operation, removing + /// the outer bound if one is present. + SmallVector getEffectiveBasis(); }]; let hasVerifier = 1; @@ -1125,13 +1153,21 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", The `affine.linearize_index` operation takes a sequence of index values and a basis of the same length and linearizes the indices using that basis. - That is, for indices `%idx_1` through `%idx_N` and basis elements `b_1` through `b_N`, - it computes + That is, for indices `%idx_0` to `%idx_{N-1}` and basis elements `b_0` + (or `b_1`) up to `b_{N-1}` it computes ``` - sum(i = 1 to N) %idx_i * product(j = i + 1 to N) B_j + sum(i = 0 to N-1) %idx_i * product(j = i + 1 to N-1) B_j ``` + The basis may either have `N` or `N-1` elements, where `N` is the number of + inputs to linearize_index. If `N` inputs are provided, the first one is not used + in computation, but may be used during analysis or canonicalization as a bound + on `%idx_0`. + + If all `N` basis elements are provided, the linearize_index operation is said to + "have an outer bound". + If the `disjoint` property is present, this is an optimization hint that, for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index, except that `%idx_0` may be negative to make the index as a whole negative. 
@@ -1141,7 +1177,9 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", Example: ```mlir - %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] (2, 3, 5) : index + %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] by (2, 3, 5) : index + // Same effect + %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] by (3, 5) : index ``` In the above example, `%linear_index` conceptually holds the following: @@ -1172,12 +1210,20 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", ]; let extraClassDeclaration = [{ + /// Return true if the basis includes a bound on the first index input. + bool hasOuterBound() { + return getMultiIndex().size() == getStaticBasis().size(); + } + /// Return a vector with all the static and dynamic basis values. SmallVector getMixedBasis() { OpBuilder builder(getContext()); return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder); } + /// Return a vector that contains the basis of the operation, removing + /// the outer bound if one is present. + SmallVector getEffectiveBasis(); }]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h index 0e98223969e08..0f801ebb6f589 100644 --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -307,17 +307,23 @@ struct DivModValue { DivModValue getDivMod(OpBuilder &b, Location loc, Value lhs, Value rhs); /// Generate the IR to delinearize `linearIndex` given the `basis` and return -/// the multi-index. +/// the multi-index. `hasOuterBound` indicates whether `basis` has an entry +/// given the size of the first multi-index result - if it is true, the function +/// will return `basis.size()` values, otherwise, it will return `basis.size() + +/// 1`. FailureOr> delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, - ArrayRef basis); + ArrayRef basis, + bool hasOuterBound = true); FailureOr> delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, - ArrayRef basis); + ArrayRef basis, + bool hasOuterBound = true); // Generate IR that extracts the linear index from a multi-index according to -// a basis/shape. +// a basis/shape. The basis may contain either `multiIndex.size()` or +// `multiIndex.size() - 1` elements. 
OpFoldResult linearizeIndex(ArrayRef multiIndex, ArrayRef basis, ImplicitLocOpBuilder &builder); diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index fbc9053a0e273..4cf07bc167eab 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -20,6 +20,7 @@ #include "mlir/Interfaces/ShapedOpInterfaces.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVectorExtras.h" @@ -4503,62 +4504,81 @@ LogicalResult AffineVectorStoreOp::verify() { // DelinearizeIndexOp //===----------------------------------------------------------------------===// -LogicalResult AffineDelinearizeIndexOp::inferReturnTypes( - MLIRContext *context, std::optional<::mlir::Location> location, - ValueRange operands, DictionaryAttr attributes, OpaqueProperties properties, - RegionRange regions, SmallVectorImpl &inferredReturnTypes) { - AffineDelinearizeIndexOpAdaptor adaptor(operands, attributes, properties, - regions); - inferredReturnTypes.assign(adaptor.getStaticBasis().size(), - IndexType::get(context)); - return success(); +void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, + OperationState &odsState, + Value linearIndex, ValueRange dynamicBasis, + ArrayRef staticBasis, + bool hasOuterBound) { + SmallVector returnTypes(hasOuterBound ? staticBasis.size() + : staticBasis.size() + 1, + linearIndex.getType()); + build(odsBuilder, odsState, returnTypes, linearIndex, dynamicBasis, + staticBasis); } void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, - Value linearIndex, ValueRange basis) { + Value linearIndex, ValueRange basis, + bool hasOuterBound) { SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis, staticBasis); - build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis); + build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis, + hasOuterBound); } void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, Value linearIndex, - ArrayRef basis) { + ArrayRef basis, + bool hasOuterBound) { SmallVector dynamicBasis; SmallVector staticBasis; dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis); - build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis); + build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis, + hasOuterBound); } void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, - Value linearIndex, - ArrayRef basis) { - build(odsBuilder, odsState, linearIndex, ValueRange{}, basis); + Value linearIndex, ArrayRef basis, + bool hasOuterBound) { + build(odsBuilder, odsState, linearIndex, ValueRange{}, basis, hasOuterBound); } LogicalResult AffineDelinearizeIndexOp::verify() { - if (getStaticBasis().empty()) - return emitOpError("basis should not be empty"); - if (getNumResults() != getStaticBasis().size()) - return emitOpError("should return an index for each basis element"); - auto dynamicMarkersCount = - llvm::count_if(getStaticBasis(), ShapedType::isDynamic); + ArrayRef staticBasis = getStaticBasis(); + if (getNumResults() != staticBasis.size() && + getNumResults() != staticBasis.size() + 1) + return emitOpError("should return an index for each basis element and up " + "to one extra index"); + + auto dynamicMarkersCount = llvm::count_if(staticBasis, 
ShapedType::isDynamic); if (static_cast(dynamicMarkersCount) != getDynamicBasis().size()) return emitOpError( "mismatch between dynamic and static basis (kDynamic marker but no " "corresponding dynamic basis entry) -- this can only happen due to an " "incorrect fold/rewrite"); + + if (!llvm::all_of(staticBasis, [](int64_t v) { + return v > 0 || ShapedType::isDynamic(v); + })) + return emitOpError("no basis element may be statically non-positive"); + return success(); } LogicalResult AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor, SmallVectorImpl &result) { + // If we won't be doing any division or modulo (no basis or the one basis + // element is purely advisory), simply return the input value. + if (getNumResults() == 1) { + result.push_back(getLinearIndex()); + return success(); + } + if (adaptor.getLinearIndex() == nullptr) return failure(); @@ -4567,7 +4587,11 @@ AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor, int64_t highPart = cast(adaptor.getLinearIndex()).getInt(); Type attrType = getLinearIndex().getType(); - for (int64_t modulus : llvm::reverse(getStaticBasis().drop_front())) { + + ArrayRef staticBasis = getStaticBasis(); + if (hasOuterBound()) + staticBasis = staticBasis.drop_front(); + for (int64_t modulus : llvm::reverse(staticBasis)) { result.push_back(IntegerAttr::get(attrType, llvm::mod(highPart, modulus))); highPart = llvm::divideFloorSigned(highPart, modulus); } @@ -4576,6 +4600,20 @@ AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor, return success(); } +SmallVector AffineDelinearizeIndexOp::getEffectiveBasis() { + OpBuilder builder(getContext()); + if (hasOuterBound()) { + if (getStaticBasis().front() == ::mlir::ShapedType::kDynamic) + return getMixedValues(getStaticBasis().drop_front(), + getDynamicBasis().drop_front(), builder); + + return getMixedValues(getStaticBasis().drop_front(), getDynamicBasis(), + builder); + } + + return getMixedValues(getStaticBasis(), getDynamicBasis(), builder); +} + namespace { // Drops delinearization indices that correspond to unit-extent basis @@ -4594,24 +4632,25 @@ struct DropUnitExtentBasis return zero.value(); }; + bool hasOuterBound = delinearizeOp.hasOuterBound(); // Replace all indices corresponding to unit-extent basis with 0. // Remaining basis can be used to get a new `affine.delinearize_index` op. - SmallVector newOperands; + SmallVector newBasis; for (auto [index, basis] : llvm::enumerate(delinearizeOp.getMixedBasis())) { std::optional basisVal = getConstantIntValue(basis); if (basisVal && *basisVal == 1) - replacements[index] = getZero(); + replacements[index + (hasOuterBound ? 0 : 1)] = getZero(); else - newOperands.push_back(basis); + newBasis.push_back(basis); } - if (newOperands.size() == delinearizeOp.getStaticBasis().size()) + if (newBasis.size() == delinearizeOp.getStaticBasis().size()) return rewriter.notifyMatchFailure(delinearizeOp, "no unit basis elements"); - if (!newOperands.empty()) { + if (!newBasis.empty() || !hasOuterBound) { auto newDelinearizeOp = rewriter.create( - loc, delinearizeOp.getLinearIndex(), newOperands); + loc, delinearizeOp.getLinearIndex(), newBasis, hasOuterBound); int newIndex = 0; // Map back the new delinearized indices to the values they replace. 
for (auto &replacement : replacements) { @@ -4626,27 +4665,6 @@ struct DropUnitExtentBasis } }; -/// Drop delinearization with a single basis element -/// -/// By definition, `delinearize_index %linear into (%basis)` is -/// `%linear floorDiv 1` (since `1` is the product of the basis elememts, -/// ignoring the 0th one, and since there is no previous division we need -/// to use the remainder of). Therefore, a single-element `delinearize` -/// can be replaced by the underlying linear index. -struct DropDelinearizeOneBasisElement - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(affine::AffineDelinearizeIndexOp delinearizeOp, - PatternRewriter &rewriter) const override { - if (delinearizeOp.getStaticBasis().size() != 1) - return rewriter.notifyMatchFailure(delinearizeOp, - "doesn't have a length-1 basis"); - rewriter.replaceOp(delinearizeOp, delinearizeOp.getLinearIndex()); - return success(); - } -}; - /// If a `affine.delinearize_index`'s input is a `affine.linearize_index /// disjoint` and the two operations have the same basis, replace the /// delinearizeation results with the inputs of the `affine.linearize_index` @@ -4668,7 +4686,7 @@ struct CancelDelinearizeOfLinearizeDisjointExact "index doesn't come from linearize"); if (!linearizeOp.getDisjoint() || - linearizeOp.getMixedBasis() != delinearizeOp.getMixedBasis()) + linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis()) return rewriter.notifyMatchFailure( linearizeOp, "not disjoint or basis doesn't match delinearize"); @@ -4680,8 +4698,9 @@ struct CancelDelinearizeOfLinearizeDisjointExact void affine::AffineDelinearizeIndexOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns + .insert( + context); } //===----------------------------------------------------------------------===// @@ -4718,11 +4737,11 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder, } LogicalResult AffineLinearizeIndexOp::verify() { - if (getStaticBasis().empty()) - return emitOpError("basis should not be empty"); - - if (getMultiIndex().size() != getStaticBasis().size()) - return emitOpError("should be passed an index for each basis element"); + size_t numIndexes = getMultiIndex().size(); + size_t numBasisElems = getStaticBasis().size(); + if (numIndexes != numBasisElems && numIndexes != numBasisElems + 1) + return emitOpError("should be passed a basis element for each index except " + "possibly the first"); auto dynamicMarkersCount = llvm::count_if(getStaticBasis(), ShapedType::isDynamic); @@ -4736,6 +4755,14 @@ LogicalResult AffineLinearizeIndexOp::verify() { } OpFoldResult AffineLinearizeIndexOp::fold(FoldAdaptor adaptor) { + // No indices linearizes to zero. + if (getMultiIndex().empty()) + return IntegerAttr::get(getResult().getType(), 0); + + // One single index linearizes to itself. 
+ if (getMultiIndex().size() == 1) + return getMultiIndex().front(); + if (llvm::any_of(adaptor.getMultiIndex(), [](Attribute a) { return a == nullptr; })) return nullptr; @@ -4745,16 +4772,35 @@ OpFoldResult AffineLinearizeIndexOp::fold(FoldAdaptor adaptor) { int64_t result = 0; int64_t stride = 1; - for (auto [indexAttr, length] : - llvm::zip_equal(llvm::reverse(adaptor.getMultiIndex()), - llvm::reverse(getStaticBasis()))) { + for (auto [length, indexAttr] : + llvm::zip_first(llvm::reverse(getStaticBasis()), + llvm::reverse(adaptor.getMultiIndex()))) { result = result + cast(indexAttr).getInt() * stride; stride = stride * length; } + // Handle the index element with no basis element. + if (!hasOuterBound()) + result = + result + + cast(adaptor.getMultiIndex().front()).getInt() * stride; return IntegerAttr::get(getResult().getType(), result); } +SmallVector AffineLinearizeIndexOp::getEffectiveBasis() { + OpBuilder builder(getContext()); + if (hasOuterBound()) { + if (getStaticBasis().front() == ::mlir::ShapedType::kDynamic) + return getMixedValues(getStaticBasis().drop_front(), + getDynamicBasis().drop_front(), builder); + + return getMixedValues(getStaticBasis().drop_front(), getDynamicBasis(), + builder); + } + + return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder); +} + namespace { /// Rewrite `affine.linearize_index disjoint [%...a, %x, %...b] by (%...c, 1, /// %...d)` to `affine.linearize_index disjoint [%...a, %...b] by (%...c, @@ -4772,14 +4818,20 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op, PatternRewriter &rewriter) const override { - size_t numIndices = op.getMultiIndex().size(); + ValueRange multiIndex = op.getMultiIndex(); + size_t numIndices = multiIndex.size(); SmallVector newIndices; newIndices.reserve(numIndices); SmallVector newBasis; newBasis.reserve(numIndices); + if (!op.hasOuterBound()) { + newIndices.push_back(multiIndex.front()); + multiIndex = multiIndex.drop_front(); + } + SmallVector basis = op.getMixedBasis(); - for (auto [index, basisElem] : llvm::zip_equal(op.getMultiIndex(), basis)) { + for (auto [index, basisElem] : llvm::zip_equal(multiIndex, basis)) { std::optional basisEntry = getConstantIntValue(basisElem); if (!basisEntry || *basisEntry != 1) { newIndices.push_back(index); @@ -4808,23 +4860,6 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final } }; -/// Rewrite `affine.linearize_index [%%x] by (%b)`, into `%x`. -/// -/// By definition, that operation is `affine.apply affine_map<()[s0] -> (s0)>,` -/// which is the identity. -struct DropLinearizeOneBasisElement final - : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op, - PatternRewriter &rewriter) const override { - if (op.getStaticBasis().size() != 1 || op.getMultiIndex().size() != 1) - return rewriter.notifyMatchFailure(op, "doesn't have a a length-1 basis"); - rewriter.replaceOp(op, op.getMultiIndex().front()); - return success(); - } -}; - /// Cancel out linearize_index(delinearize_index(x, B), B). 
/// /// That is, rewrite @@ -4847,10 +4882,10 @@ struct CancelLinearizeOfDelinearizeExact final return rewriter.notifyMatchFailure( linearizeOp, "last entry doesn't come from a delinearize"); - if (linearizeOp.getMixedBasis() != delinearizeOp.getMixedBasis()) + if (linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis()) return rewriter.notifyMatchFailure( - linearizeOp, - "basis of linearize and delinearize don't match exactly"); + linearizeOp, "basis of linearize and delinearize don't match exactly " + "(excluding outer bounds)"); if (delinearizeOp.getResults() != linearizeOp.getMultiIndex()) return rewriter.notifyMatchFailure( @@ -4881,9 +4916,12 @@ struct DropLinearizeLeadingZero final } SmallVector mixedBasis = op.getMixedBasis(); + ArrayRef newMixedBasis = mixedBasis; + if (op.hasOuterBound()) + newMixedBasis = newMixedBasis.drop_front(); + rewriter.replaceOpWithNewOp( - op, op.getMultiIndex().drop_front(), - ArrayRef(mixedBasis).drop_front(), op.getDisjoint()); + op, op.getMultiIndex().drop_front(), newMixedBasis, op.getDisjoint()); return success(); } }; @@ -4892,7 +4930,6 @@ struct DropLinearizeLeadingZero final void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { patterns.add(context); } diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp index 1930e987a33ff..15478e0e1e3a5 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp @@ -36,8 +36,9 @@ struct LowerDelinearizeIndexOps using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(AffineDelinearizeIndexOp op, PatternRewriter &rewriter) const override { - FailureOr> multiIndex = delinearizeIndex( - rewriter, op->getLoc(), op.getLinearIndex(), op.getMixedBasis()); + FailureOr> multiIndex = + delinearizeIndex(rewriter, op->getLoc(), op.getLinearIndex(), + op.getEffectiveBasis(), /*hasOuterBound=*/false); if (failed(multiIndex)) return failure(); rewriter.replaceOp(op, *multiIndex); @@ -51,6 +52,12 @@ struct LowerLinearizeIndexOps final : OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(AffineLinearizeIndexOp op, PatternRewriter &rewriter) const override { + // Should be folded away, included here for safety. + if (op.getMultiIndex().empty()) { + rewriter.replaceOpWithNewOp(op, 0); + return success(); + } + SmallVector multiIndex = getAsOpFoldResult(op.getMultiIndex()); OpFoldResult linearIndex = diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 7fe422f75c8fa..3420db771ef42 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -1944,11 +1944,14 @@ static FailureOr composedAffineMultiply(OpBuilder &b, FailureOr> mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, - ArrayRef basis) { + ArrayRef basis, bool hasOuterBound) { + if (hasOuterBound) + basis = basis.drop_front(); + // Note: the divisors are backwards due to the scan. 
SmallVector divisors; OpFoldResult basisProd = b.getIndexAttr(1); - for (OpFoldResult basisElem : llvm::reverse(basis.drop_front())) { + for (OpFoldResult basisElem : llvm::reverse(basis)) { FailureOr nextProd = composedAffineMultiply(b, loc, basisElem, basisProd); if (failed(nextProd)) @@ -1971,11 +1974,15 @@ mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, FailureOr> mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex, - ArrayRef basis) { + ArrayRef basis, + bool hasOuterBound) { + if (hasOuterBound) + basis = basis.drop_front(); + // Note: the divisors are backwards due to the scan. SmallVector divisors; OpFoldResult basisProd = b.getIndexAttr(1); - for (OpFoldResult basisElem : llvm::reverse(basis.drop_front())) { + for (OpFoldResult basisElem : llvm::reverse(basis)) { FailureOr nextProd = composedAffineMultiply(b, loc, basisElem, basisProd); if (failed(nextProd)) @@ -2005,8 +2012,15 @@ OpFoldResult mlir::affine::linearizeIndex(ArrayRef multiIndex, OpFoldResult mlir::affine::linearizeIndex(OpBuilder &builder, Location loc, ArrayRef multiIndex, ArrayRef basis) { - assert(multiIndex.size() == basis.size()); + assert(multiIndex.size() == basis.size() || + multiIndex.size() == basis.size() + 1); SmallVector basisAffine; + + // Add a fake initial size in order to make the later index linearization + // computations line up if an outer bound is not provided. + if (multiIndex.size() == basis.size() + 1) + basisAffine.push_back(getAffineConstantExpr(1, builder.getContext())); + for (size_t i = 0; i < basis.size(); ++i) { basisAffine.push_back(getAffineSymbolExpr(i, builder.getContext())); } diff --git a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir index ded1687ca560b..650555cfb5fe1 100644 --- a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir +++ b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir @@ -35,10 +35,10 @@ func.func @dynamic_basis(%linear_index: index, %src: memref) -> (inde %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %b0 = memref.dim %src, %c0 : memref %b1 = memref.dim %src, %c1 : memref %b2 = memref.dim %src, %c2 : memref - %1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index + // Note: no outer bound. 
+ %1:3 = affine.delinearize_index %linear_index into (%b1, %b2) : index, index, index return %1#0, %1#1, %1#2 : index, index, index } @@ -60,10 +60,11 @@ func.func @linearize_static(%arg0: index, %arg1: index, %arg2: index) -> index { // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s1 * s2 + s3 + s0 * (s2 * s4))> // CHECK-LABEL: @linearize_dynamic -// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index, %[[arg4:.+]]: index, %[[arg5:.+]]: index) -// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg5]], %[[arg2]], %[[arg4]]] +// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index, %[[arg4:.+]]: index) +// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg4]], %[[arg2]], %[[arg3]]] // CHECK: return %[[val_0]] -func.func @linearize_dynamic(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> index { - %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (%arg3, %arg4, %arg5) : index +func.func @linearize_dynamic(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> index { + // Note: no outer bounds + %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (%arg3, %arg4) : index func.return %0 : index } diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index ec00b31258d07..b54a13cffe777 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1496,6 +1496,20 @@ func.func @delinearize_fold_negative_constant() -> (index, index, index) { // ----- +// CHECK-LABEL: @delinearize_fold_negative_constant_no_outer_bound +// CHECK-DAG: %[[C_2:.+]] = arith.constant -2 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-NOT: affine.delinearize_index +// CHECK: return %[[C_2]], %[[C1]], %[[C3]] +func.func @delinearize_fold_negative_constant_no_outer_bound() -> (index, index, index) { + %c_22 = arith.constant -22 : index + %0:3 = affine.delinearize_index %c_22 into (3, 5) : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} + +// ----- + // CHECK-LABEL: @delinearize_dont_fold_constant_dynamic_basis // CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index // CHECK: %[[RET:.+]]:3 = affine.delinearize_index %[[C22]] @@ -1525,6 +1539,23 @@ func.func @drop_unit_basis_in_delinearize(%arg0 : index, %arg1 : index, %arg2 : // ----- +func.func @drop_unit_basis_in_delinearize_no_outer_bound(%arg0 : index, %arg1 : index, %arg2 : index) -> + (index, index, index, index, index, index) { + %c1 = arith.constant 1 : index + %0:6 = affine.delinearize_index %arg0 into (%arg1, 1, 1, %arg2, %c1) + : index, index, index, index, index, index + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : index, index, index, index, index, index +} +// CHECK-LABEL: func @drop_unit_basis_in_delinearize_no_outer_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[DELINEARIZE:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], %[[ARG2]]) +// CHECK: return %[[DELINEARIZE]]#0, %[[DELINEARIZE]]#1, %[[C0]], %[[C0]], %[[DELINEARIZE]]#2, %[[C0]] + +// ----- + func.func @drop_all_unit_bases(%arg0 : index) -> (index, index) { %0:2 = affine.delinearize_index %arg0 into (1, 1) : index, index 
return %0#0, %0#1 : index, index @@ -1537,6 +1568,18 @@ func.func @drop_all_unit_bases(%arg0 : index) -> (index, index) { // ----- +func.func @drop_all_unit_bases_no_outer_bound(%arg0 : index) -> (index, index, index) { + %0:3 = affine.delinearize_index %arg0 into (1, 1) : index, index, index + return %0#0, %0#1, %0#2 : index, index, index +} +// CHECK-LABEL: func @drop_all_unit_bases_no_outer_bound( +// CHECK-SAME: %[[ARG0:.+]]: index) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-NOT: affine.delinearize_index +// CHECK: return %[[ARG0]], %[[C0]], %[[C0]] + +// ----- + func.func @drop_single_loop_delinearize(%arg0 : index, %arg1 : index) -> index { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -1574,6 +1617,17 @@ func.func @delinearize_non_loop_like(%arg0: memref, %i : index) -> index // ----- +// CHECK-LABEL: func @delinearize_empty_basis +// CHECK-SAME: (%[[ARG0:.+]]: index) +// CHECK-NOT: affine.delinearize +// CHECK: return %[[ARG0]] +func.func @delinearize_empty_basis(%arg0: index) -> index { + %0 = affine.delinearize_index %arg0 into () : index + return %0 : index +} + +// ----- + // CHECK-LABEL: @linearize_fold_constants // CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index // CHECK-NOT: affine.linearize @@ -1588,6 +1642,42 @@ func.func @linearize_fold_constants() -> index { // ----- +// CHECK-LABEL: @linearize_fold_constants_no_outer_bound +// CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index +// CHECK-NOT: affine.linearize +// CHECK: return %[[C22]] +func.func @linearize_fold_constants_no_outer_bound() -> index { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + + %ret = affine.linearize_index [%c1, %c1, %c2] by (3, 5) : index + return %ret : index +} + +// ----- + +// CHECK-LABEL: @linearize_fold_empty_basis +// CHECK-SAME: (%[[ARG0:.+]]: index) +// CHECK-NOT: affine.linearize +// CHECK: return %[[ARG0]] +func.func @linearize_fold_empty_basis(%arg0: index) -> index { + %ret = affine.linearize_index [%arg0] by () : index + return %ret : index +} + +// ----- + +// CHECK-LABEL: @linearize_fold_only_outer_bound +// CHECK-SAME: (%[[ARG0:.+]]: index) +// CHECK-NOT: affine.linearize +// CHECK: return %[[ARG0]] +func.func @linearize_fold_only_outer_bound(%arg0: index) -> index { + %ret = affine.linearize_index [%arg0] by (2) : index + return %ret : index +} + +// ----- + // CHECK-LABEL: @linearize_dont_fold_dynamic_basis // CHECK: %[[RET:.+]] = affine.linearize_index // CHECK: return %[[RET]] @@ -1617,6 +1707,38 @@ func.func @cancel_delinearize_linearize_disjoint_exact(%arg0: index, %arg1: inde // ----- +// CHECK-LABEL: func @cancel_delinearize_linearize_disjoint_linearize_extra_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index) +// CHECK: return %[[ARG0]], %[[ARG1]], %[[ARG2]] +func.func @cancel_delinearize_linearize_disjoint_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { + %0 = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (4, %arg4) : index + %1:3 = affine.delinearize_index %0 into (4, %arg4) + : index, index, index + return %1#0, %1#1, %1#2 : index, index, index +} + +// ----- + +// CHECK-LABEL: func @cancel_delinearize_linearize_disjoint_delinearize_extra_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// 
CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index) +// CHECK: return %[[ARG0]], %[[ARG1]], %[[ARG2]] +func.func @cancel_delinearize_linearize_disjoint_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) { + %0 = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (4, %arg4) : index + %1:3 = affine.delinearize_index %0 into (%arg3, 4, %arg4) + : index, index, index + return %1#0, %1#1, %1#2 : index, index, index +} + +// ----- + // Without `disjoint`, the cancelation isn't guaranteed to be the identity. // CHECK-LABEL: func @no_cancel_delinearize_linearize_exact( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, @@ -1666,6 +1788,17 @@ func.func @linearize_unit_basis_disjoint(%arg0: index, %arg1: index, %arg2: inde // ----- +// CHECK-LABEL: @linearize_unit_basis_disjoint_no_outer_bound +// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index) +// CHECK: %[[ret:.+]] = affine.linearize_index disjoint [%[[arg0]], %[[arg2]]] by (%[[arg3]]) : index +// CHECK: return %[[ret]] +func.func @linearize_unit_basis_disjoint_no_outer_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> index { + %ret = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (1, %arg3) : index + return %ret : index +} + +// ----- + // CHECK-LABEL: @linearize_unit_basis_zero // CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index) // CHECK: %[[ret:.+]] = affine.linearize_index [%[[arg0]], %[[arg1]]] by (3, %[[arg2]]) : index @@ -1713,6 +1846,32 @@ func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: i // ----- +// CHECK-LABEL: func @cancel_linearize_denearize_linearize_extra_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK: return %[[ARG0]] +func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (4, %arg2) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @cancel_linearize_denearize_delinearize_extra_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index) +// CHECK: return %[[ARG0]] +func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index { + %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index + %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (4, %arg2) : index + return %1 : index +} + +// ----- + // Don't cancel because the values from the delinearize aren't used in order // CHECK-LABEL: func @no_cancel_linearize_denearize_permuted( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, @@ -1756,3 +1915,16 @@ func.func @affine_leading_zero(%arg0: index, %arg1: index) -> index { return %ret : index } +// ----- + +// CHECK-LABEL: func @affine_leading_zero_no_outer_bound( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index) +// CHECK: %[[RET:.+]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (3, 5) +// CHECK: return %[[RET]] +func.func @affine_leading_zero_no_outer_bound(%arg0: index, %arg1: index) -> index { + %c0 = arith.constant 0 : index + 
%ret = affine.linearize_index [%c0, %arg0, %arg1] by (3, 5) : index + return %ret : index +} + diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 2996194170900..1539b4f484827 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -533,37 +533,29 @@ func.func @missing_for_min(%arg0: index, %arg1: index, %arg2: memref<100xf32>) { // ----- func.func @delinearize(%idx: index, %basis0: index, %basis1 :index) { - // expected-error@+1 {{'affine.delinearize_index' op should return an index for each basis element}} + // expected-error@+1 {{'affine.delinearize_index' op should return an index for each basis element and up to one extra index}} %1 = affine.delinearize_index %idx into (%basis0, %basis1) : index return } // ----- -func.func @delinearize(%idx: index, %basis0: index, %basis1 :index) { - // expected-error@+1 {{'affine.delinearize_index' op basis should not be empty}} - affine.delinearize_index %idx into () : index +func.func @delinearize(%idx: index) { + // expected-error@+1 {{'affine.delinearize_index' op no basis element may be statically non-positive}} + %1:2 = affine.delinearize_index %idx into (2, -2) : index, index return } // ----- func.func @linearize(%idx: index, %basis0: index, %basis1 :index) -> index { - // expected-error@+1 {{'affine.linearize_index' op should be passed an index for each basis element}} + // expected-error@+1 {{'affine.linearize_index' op should be passed a basis element for each index except possibly the first}} %0 = affine.linearize_index [%idx] by (%basis0, %basis1) : index return %0 : index } // ----- -func.func @linearize_empty() -> index { - // expected-error@+1 {{'affine.linearize_index' op basis should not be empty}} - %0 = affine.linearize_index [] by () : index - return %0 : index -} - -// ----- - func.func @dynamic_dimension_index() { "unknown.region"() ({ %idx = "unknown.test"() : () -> (index) diff --git a/mlir/test/python/dialects/affine.py b/mlir/test/python/dialects/affine.py index 7faae6ccedc97..7ef128c1724c4 100644 --- a/mlir/test/python/dialects/affine.py +++ b/mlir/test/python/dialects/affine.py @@ -50,7 +50,7 @@ def testAffineDelinearizeInfer(): # CHECK: %[[C1:.*]] = arith.constant 1 : index c1 = arith.ConstantOp(T.index(), 1) # CHECK: %{{.*}}:2 = affine.delinearize_index %[[C1:.*]] into (2, 3) : index, index - two_indices = affine.AffineDelinearizeIndexOp(c1, [], [2, 3]) + two_indices = affine.AffineDelinearizeIndexOp([T.index()] * 2, c1, [], [2, 3]) # CHECK-LABEL: TEST: testAffineLoadOp @@ -157,7 +157,7 @@ def testAffineForOpErrors(): ) try: - two_indices = affine.AffineDelinearizeIndexOp(c1, [], [1, 1]) + two_indices = affine.AffineDelinearizeIndexOp([T.index()] * 2, c1, [], [1, 1]) affine.AffineForOp( two_indices, c2, From 55876278d362020503db5f0e66313829c40ff640 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Mon, 18 Nov 2024 13:47:08 -0800 Subject: [PATCH 043/366] [NVPTX] Add support for f16 fabs (#116107) Add support for f16 and f16x2 support for abs. See PTX ISA 9.7.4.6. 
Half Precision Floating Point Instructions: abs https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 21 +++-- llvm/test/CodeGen/NVPTX/f16-abs.ll | 98 +++++++++++++++++++++ 2 files changed, 110 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/f16-abs.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 4ad0200ca5cf8..e93430a27dc32 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -862,16 +862,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); } - for (const auto &Op : {ISD::FABS}) { - setOperationAction(Op, MVT::f16, Promote); - setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); - setBF16OperationAction(Op, MVT::bf16, Legal, Promote); - if (getOperationAction(Op, MVT::bf16) == Promote) - AddPromotedToType(Op, MVT::bf16, MVT::f32); + + setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal); + if (STI.getPTXVersion() >= 65) { + setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote); + setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand); + } else { + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FABS, MVT::v2f16, Expand); } + setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand); + setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote); + if (getOperationAction(ISD::FABS, MVT::bf16) == Promote) + AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32); for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { setOperationAction(Op, MVT::f32, Legal); diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll new file mode 100644 index 0000000000000..d12653e813bd1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; ## FP16 abs is not supported by PTX version (PTX < 65). +; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} + +; ## FP16 support explicitly disabled (--nvptx-no-f16-math). +; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} + +; ## FP16 is not supported by hardware (SM < 53). +; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix CHECK-NOF16 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_52 \ +; RUN: %} + +; ## Full FP16 abs support. 
+; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix CHECK-F16-ABS %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 \ +; RUN: -O0 -disable-post-ra -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_53 \ +; RUN: %} + +target triple = "nvptx64-nvidia-cuda" + +declare half @llvm.fabs.f16(half %a) +declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a) + +define half @test_fabs(half %a) { +; CHECK-NOF16-LABEL: test_fabs( +; CHECK-NOF16: { +; CHECK-NOF16-NEXT: .reg .b16 %rs<3>; +; CHECK-NOF16-NEXT: .reg .f32 %f<3>; +; CHECK-NOF16-EMPTY: +; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NOF16-NEXT: abs.f32 %f2, %f1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NOF16-NEXT: ret; +; +; CHECK-F16-ABS-LABEL: test_fabs( +; CHECK-F16-ABS: { +; CHECK-F16-ABS-NEXT: .reg .b16 %rs<3>; +; CHECK-F16-ABS-EMPTY: +; CHECK-F16-ABS-NEXT: // %bb.0: +; CHECK-F16-ABS-NEXT: ld.param.b16 %rs1, [test_fabs_param_0]; +; CHECK-F16-ABS-NEXT: abs.f16 %rs2, %rs1; +; CHECK-F16-ABS-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-F16-ABS-NEXT: ret; + %r = call half @llvm.fabs.f16(half %a) + ret half %r +} + +define <2 x half> @test_fabs_2(<2 x half> %a) #0 { +; CHECK-F16-LABEL: test_fabs_2( +; CHECK-F16: { +; CHECK-F16-NEXT: .reg .b32 %r<5>; +; CHECK-F16-EMPTY: +; CHECK-F16-NEXT: // %bb.0: +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0]; +; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-F16-NEXT: ret; +; +; CHECK-F16-ABS-LABEL: test_fabs_2( +; CHECK-F16-ABS: { +; CHECK-F16-ABS-NEXT: .reg .b32 %r<3>; +; CHECK-F16-ABS-EMPTY: +; CHECK-F16-ABS-NEXT: // %bb.0: +; CHECK-F16-ABS-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0]; +; CHECK-F16-ABS-NEXT: abs.f16x2 %r2, %r1; +; CHECK-F16-ABS-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-F16-ABS-NEXT: ret; + %r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) + ret <2 x half> %r +} + From e0b522dd94e48229d587a54a3103ba1c198b16a7 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Mon, 18 Nov 2024 13:56:33 -0800 Subject: [PATCH 044/366] [DirectX] Fix crash in DXILFlattenArrays for function declarations (#116690) We were skipping intrinsics here, but really we need to skip all function declarations - if the function doesn't have a body there's nothing to walk. 
--- llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 2 +- llvm/test/CodeGen/DirectX/flatten-array.ll | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index dec3a9b4a8264..e4a3bc76eeacd 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -402,7 +402,7 @@ static bool flattenArrays(Module &M) { DenseMap GlobalMap; flattenGlobalArrays(M, GlobalMap); for (auto &F : make_early_inc_range(M.functions())) { - if (F.isIntrinsic()) + if (F.isDeclaration()) continue; MadeChange |= Impl.visit(F); } diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll index fd894e0104c4e..754d5a25ca905 100644 --- a/llvm/test/CodeGen/DirectX/flatten-array.ll +++ b/llvm/test/CodeGen/DirectX/flatten-array.ll @@ -186,3 +186,6 @@ define void @global_gep_store() { store i32 1, i32* %3, align 4 ret void } + +; Make sure we don't try to walk the body of a function declaration. +declare void @opaque_function() From a4e1a3dc8bc9bb971d8a38130254b4570f8b7a03 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 18 Nov 2024 14:09:21 -0800 Subject: [PATCH 045/366] [memprof] Add another constructor to IndexedAllocationInfo (NFC) (#116684) This patch adds another constructor to IndexedAllocationInfo that is identical to the existing constructor except that the new one leaves the CallStack field empty. I'm planning to remove MemProf format Version 1. Then we will migrate the users of the existing constructor to the new one as nobody will be using the CallStack field anymore. Adding the new constructor now allows us to migrate a few existing users of the old constructor even before we remove the CallStack field. In turn, that simplifies the patch to actually remove the field. --- llvm/include/llvm/ProfileData/MemProf.h | 5 +++++ llvm/unittests/ProfileData/InstrProfTest.cpp | 3 +-- llvm/unittests/ProfileData/MemProfTest.cpp | 10 ++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 6a29e3df9629b..9415e554bcc0a 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -354,10 +354,15 @@ struct IndexedAllocationInfo { PortableMemInfoBlock Info; IndexedAllocationInfo() = default; + // This constructor is soft deprecated. It will be removed once we remove all + // users of the CallStack field. IndexedAllocationInfo(ArrayRef CS, CallStackId CSId, const MemInfoBlock &MB, const MemProfSchema &Schema = getFullSchema()) : CallStack(CS), CSId(CSId), Info(MB, Schema) {} + IndexedAllocationInfo(CallStackId CSId, const MemInfoBlock &MB, + const MemProfSchema &Schema = getFullSchema()) + : CSId(CSId), Info(MB, Schema) {} // Returns the size in bytes when this allocation info struct is serialized. size_t serializedSize(const MemProfSchema &Schema, diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index b9f244104c65c..5a313aa4182a5 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -415,8 +415,7 @@ makeRecordV2(std::initializer_list<::llvm::memprof::CallStackId> AllocFrames, for (const auto &CSId : AllocFrames) // We don't populate IndexedAllocationInfo::CallStack because we use it only // in Version1. 
- MR.AllocSites.emplace_back(::llvm::SmallVector(), CSId, - Block, Schema); + MR.AllocSites.emplace_back(CSId, Block, Schema); for (const auto &CSId : CallSiteFrames) MR.CallSiteIds.push_back(CSId); return MR; diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index c90669811e60a..5097dbdd6c391 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -315,7 +315,7 @@ TEST(MemProf, RecordSerializationRoundTripVerion2) { IndexedMemProfRecord Record; for (const auto &CSId : CallStackIds) { // Use the same info block for both allocation sites. - Record.AllocSites.emplace_back(llvm::SmallVector(), CSId, Info); + Record.AllocSites.emplace_back(CSId, Info); } Record.CallSiteIds.assign(CallSiteIds); @@ -346,8 +346,7 @@ TEST(MemProf, RecordSerializationRoundTripVersion2HotColdSchema) { IndexedMemProfRecord Record; for (const auto &CSId : CallStackIds) { // Use the same info block for both allocation sites. - Record.AllocSites.emplace_back(llvm::SmallVector(), CSId, Info, - Schema); + Record.AllocSites.emplace_back(CSId, Info, Schema); } Record.CallSiteIds.assign(CallSiteIds); @@ -510,7 +509,6 @@ TEST(MemProf, BaseMemProfReaderWithCSIdMap) { Block.AllocCount = 1U, Block.TotalAccessDensity = 4, Block.TotalLifetime = 200001; FakeRecord.AllocSites.emplace_back( - /*CS=*/llvm::SmallVector(), /*CSId=*/llvm::memprof::hashCallStack(CallStack), /*MB=*/Block); ProfData.insert({F1.hash(), FakeRecord}); @@ -610,7 +608,7 @@ MemInfoBlock makePartialMIB() { TEST(MemProf, MissingCallStackId) { // Use a non-existent CallStackId to trigger a mapping error in // toMemProfRecord. - llvm::memprof::IndexedAllocationInfo AI({}, 0xdeadbeefU, makePartialMIB(), + llvm::memprof::IndexedAllocationInfo AI(0xdeadbeefU, makePartialMIB(), llvm::memprof::getHotColdSchema()); IndexedMemProfRecord IndexedMR; @@ -633,7 +631,7 @@ TEST(MemProf, MissingCallStackId) { } TEST(MemProf, MissingFrameId) { - llvm::memprof::IndexedAllocationInfo AI({}, 0x222, makePartialMIB(), + llvm::memprof::IndexedAllocationInfo AI(0x222, makePartialMIB(), llvm::memprof::getHotColdSchema()); IndexedMemProfRecord IndexedMR; From ad9c0b369e86e75d56e229f294782a4eaf527226 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 18 Nov 2024 13:49:04 -0800 Subject: [PATCH 046/366] [SLP]Check if the gathered loads form full vector before attempting build it Need to check that the number of gathered loads in the slice forms the build vector to avoid compiler crash. Fixes #116691 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 27 ++-- .../X86/gathered-loads-non-full-reg.ll | 140 ++++++++++++++++++ 2 files changed, 156 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 918d7663548f5..dc0dffd9fcbf8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6815,16 +6815,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // Check if it is profitable to try vectorizing gathered loads. It is // profitable if we have more than 3 consecutive loads or if we have // less but all users are vectorized or deleted. 
- bool AllowToVectorize = - NumElts >= 3 || - any_of(ValueToGatherNodes.at(Slice.front()), - [=](const TreeEntry *TE) { - return TE->Scalars.size() == 2 && - ((TE->Scalars.front() == Slice.front() && - TE->Scalars.back() == Slice.back()) || - (TE->Scalars.front() == Slice.back() && - TE->Scalars.back() == Slice.front())); - }); + bool AllowToVectorize = false; // Check if it is profitable to vectorize 2-elements loads. if (NumElts == 2) { bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( @@ -6861,6 +6852,19 @@ void BoUpSLP::tryToVectorizeGatheredLoads( return true; }; AllowToVectorize = CheckIfAllowed(Slice); + } else { + AllowToVectorize = + (NumElts >= 3 || + any_of(ValueToGatherNodes.at(Slice.front()), + [=](const TreeEntry *TE) { + return TE->Scalars.size() == 2 && + ((TE->Scalars.front() == Slice.front() && + TE->Scalars.back() == Slice.back()) || + (TE->Scalars.front() == Slice.back() && + TE->Scalars.back() == Slice.front())); + })) && + hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), + Slice.size()); } if (AllowToVectorize) { SmallVector PointerOps; @@ -6903,7 +6907,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( } // Mark masked gathers candidates as vectorized, if any. for (unsigned Cnt : MaskedGatherVectorized) { - ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + ArrayRef Slice = ArrayRef(Loads).slice( + Cnt, std::min(NumElts, Loads.size() - Cnt)); ArrayRef Values( reinterpret_cast(Slice.begin()), Slice.size()); Results.emplace_back(Values, LoadsState::ScatterVectorize); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll new file mode 100644 index 0000000000000..79aba19ab02e1 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mcpu=cascadelake < %s | FileCheck %s + +@solid_ = external global [608 x i8] + +define void @test(ptr noalias %0) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr noalias [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[_LR_PH1019:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 128 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 200 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 208 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 232 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 288 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 320 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 304 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 424 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 480 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 504 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 632 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 720 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = fadd double [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr 
[[TMP4]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = fadd double [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP29:%.*]] = fadd double [[TMP28]], [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul double [[TMP22]], [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = fmul double [[TMP30]], 0.000000e+00 +; CHECK-NEXT: [[TMP32:%.*]] = fsub double 0.000000e+00, [[TMP25]] +; CHECK-NEXT: [[TMP33:%.*]] = fmul double [[TMP32]], 0.000000e+00 +; CHECK-NEXT: [[TMP34:%.*]] = fadd double [[TMP33]], 0.000000e+00 +; CHECK-NEXT: [[TMP35:%.*]] = fmul double [[TMP34]], 0.000000e+00 +; CHECK-NEXT: [[TMP36:%.*]] = fmul double [[TMP29]], [[TMP26]] +; CHECK-NEXT: [[TMP37:%.*]] = fmul double [[TMP36]], 0.000000e+00 +; CHECK-NEXT: [[TMP38:%.*]] = fadd double [[TMP37]], 0.000000e+00 +; CHECK-NEXT: [[TMP39:%.*]] = fsub double [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul double [[TMP39]], [[TMP23]] +; CHECK-NEXT: [[TMP41:%.*]] = fmul double [[TMP40]], 0.000000e+00 +; CHECK-NEXT: [[TMP42:%.*]] = load double, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP43:%.*]] = load double, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP44:%.*]] = fmul double [[TMP43]], [[TMP31]] +; CHECK-NEXT: [[TMP45:%.*]] = load double, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = fmul double [[TMP35]], 0.000000e+00 +; CHECK-NEXT: [[TMP47:%.*]] = fadd double [[TMP44]], 0.000000e+00 +; CHECK-NEXT: [[TMP48:%.*]] = fmul double [[TMP45]], [[TMP38]] +; CHECK-NEXT: [[TMP49:%.*]] = fmul double [[TMP45]], [[TMP41]] +; CHECK-NEXT: store double [[TMP46]], ptr getelementptr inbounds (i8, ptr @solid_, i64 384), align 8 +; CHECK-NEXT: store double [[TMP47]], ptr getelementptr inbounds (i8, ptr @solid_, i64 408), align 8 +; CHECK-NEXT: store double [[TMP48]], ptr getelementptr inbounds (i8, ptr @solid_, i64 392), align 8 +; CHECK-NEXT: store double [[TMP49]], ptr getelementptr inbounds (i8, ptr @solid_, i64 400), align 8 +; CHECK-NEXT: [[DOTNEG965:%.*]] = fmul double [[TMP48]], [[TMP24]] +; CHECK-NEXT: [[REASS_ADD993:%.*]] = fadd double [[DOTNEG965]], 0.000000e+00 +; CHECK-NEXT: [[TMP50:%.*]] = fadd double [[TMP42]], [[REASS_ADD993]] +; CHECK-NEXT: [[TMP51:%.*]] = fsub double 0.000000e+00, [[TMP50]] +; CHECK-NEXT: store double [[TMP51]], ptr getelementptr inbounds (i8, ptr @solid_, i64 296), align 8 +; CHECK-NEXT: [[DOTNEG969:%.*]] = fmul double [[TMP49]], 0.000000e+00 +; CHECK-NEXT: [[REASS_ADD996:%.*]] = fadd double [[DOTNEG969]], 0.000000e+00 +; CHECK-NEXT: [[TMP52:%.*]] = fadd double [[TMP45]], [[REASS_ADD996]] +; CHECK-NEXT: [[TMP53:%.*]] = fsub double 0.000000e+00, [[TMP52]] +; CHECK-NEXT: store double [[TMP53]], ptr getelementptr inbounds (i8, ptr @solid_, i64 304), align 8 +; CHECK-NEXT: ret void +; +.lr.ph1019: + %1 = getelementptr i8, ptr %0, i64 8 + %2 = getelementptr i8, ptr %0, i64 32 + %3 = getelementptr i8, ptr %0, i64 128 + %4 = getelementptr i8, ptr %0, i64 200 + %5 = getelementptr i8, ptr %0, i64 208 + %6 = getelementptr i8, ptr %0, i64 232 + %7 = getelementptr i8, ptr %0, i64 288 + %8 = getelementptr i8, ptr %0, i64 320 + %9 = 
getelementptr i8, ptr %0, i64 304 + %10 = getelementptr i8, ptr %0, i64 424 + %11 = getelementptr i8, ptr %0, i64 480 + %12 = getelementptr i8, ptr %0, i64 504 + %13 = getelementptr i8, ptr %0, i64 632 + %14 = getelementptr i8, ptr %0, i64 720 + %15 = load double, ptr %1, align 8 + %16 = load double, ptr %2, align 8 + %17 = fadd double %16, %15 + %18 = load double, ptr %3, align 8 + %19 = load double, ptr %4, align 8 + %20 = load double, ptr %5, align 8 + %21 = load double, ptr %6, align 8 + %22 = fadd double %21, %20 + %23 = load double, ptr %7, align 8 + %24 = load double, ptr %8, align 8 + %25 = load double, ptr %9, align 8 + %26 = load double, ptr %10, align 8 + %27 = load double, ptr %11, align 8 + %28 = load double, ptr %12, align 8 + %29 = fadd double %28, %27 + %30 = fmul double %22, %18 + %31 = fmul double %30, 0.000000e+00 + %32 = fsub double 0.000000e+00, %25 + %33 = fmul double %32, 0.000000e+00 + %34 = fadd double %33, 0.000000e+00 + %35 = fmul double %34, 0.000000e+00 + %36 = fmul double %29, %26 + %37 = fmul double %36, 0.000000e+00 + %38 = fadd double %37, 0.000000e+00 + %39 = fsub double %17, %19 + %40 = fmul double %39, %23 + %41 = fmul double %40, 0.000000e+00 + %42 = load double, ptr %0, align 8 + %43 = load double, ptr %13, align 8 + %44 = fmul double %43, %31 + %45 = load double, ptr %14, align 8 + %46 = fmul double %35, 0.000000e+00 + %47 = fadd double %44, 0.000000e+00 + %48 = fmul double %45, %38 + %49 = fmul double %45, %41 + store double %46, ptr getelementptr inbounds (i8, ptr @solid_, i64 384), align 8 + store double %47, ptr getelementptr inbounds (i8, ptr @solid_, i64 408), align 8 + store double %48, ptr getelementptr inbounds (i8, ptr @solid_, i64 392), align 8 + store double %49, ptr getelementptr inbounds (i8, ptr @solid_, i64 400), align 8 + %.neg965 = fmul double %48, %24 + %reass.add993 = fadd double %.neg965, 0.000000e+00 + %50 = fadd double %42, %reass.add993 + %51 = fsub double 0.000000e+00, %50 + store double %51, ptr getelementptr inbounds (i8, ptr @solid_, i64 296), align 8 + %.neg969 = fmul double %49, 0.000000e+00 + %reass.add996 = fadd double %.neg969, 0.000000e+00 + %52 = fadd double %45, %reass.add996 + %53 = fsub double 0.000000e+00, %52 + store double %53, ptr getelementptr inbounds (i8, ptr @solid_, i64 304), align 8 + ret void +} From b083340cb663b6bd785dbd5864e5afd950745e35 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Mon, 18 Nov 2024 17:12:19 -0500 Subject: [PATCH 047/366] [llvm][NVPTX] Don't reorder MIs that construct a PTX function call (#116522) With "-enable-misched", MachineScheduler can reorder MIs that must stick together (in initially set order) to generate legal PTX code for a function call. 
When generating PTX code for the attached test (using LLVM before this revision), the following invalid PTX code is generated: ``` { // callseq 0, 0 .param .b64 param0; st.param.f64 [param0], 0d0000000000000000; .param .b64 retval0; call.uni (retval0), mul.lo.s32 %r7, %r10, %r3; or.b32 %r8, %r4, %r7; mul.lo.s32 %r9, %r2, %r8; cvt.rn.f64.s32 %fd3, %r9; quux, ( param0 ); ld.param.f64 %fd1, [retval0]; } // callseq 0 ``` --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 20 +++++++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.h | 3 + llvm/test/CodeGen/NVPTX/misched_func_call.ll | 59 ++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/misched_func_call.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 4661c059d5f78..b4dbe6a0930ca 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -199,3 +199,23 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB); return 2; } + +bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Prevent the scheduler from reordering & splitting up MachineInstrs + // which must stick together (in initially set order) to + // comprise a valid PTX function call sequence. + switch (MI.getOpcode()) { + case NVPTX::CallUniPrintCallRetInst1: + case NVPTX::CallArgBeginInst: + case NVPTX::CallArgI32imm: + case NVPTX::CallArgParam: + case NVPTX::LastCallArgI32imm: + case NVPTX::LastCallArgParam: + case NVPTX::CallArgEndInst1: + return true; + } + + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index f674a00bc351b..a1d9f01712018 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -67,6 +67,9 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo { MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll new file mode 100644 index 0000000000000..e036753ce9030 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { +; CHECK-LABEL: wombat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %bb +; CHECK-NEXT: ld.param.u32 %r4, [wombat_param_2]; +; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1]; +; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0]; +; CHECK-NEXT: mov.b32 %r10, 0; +; CHECK-NEXT: mov.u64 %rd1, 0; +; CHECK-NEXT: mov.b32 %r6, 1; +; CHECK-NEXT: $L__BB0_1: // %bb3 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.f64 [param0], 0d0000000000000000; +; CHECK-NEXT: .param .b64 retval0; +; 
CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: quux, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; +; CHECK-NEXT: or.b32 %r8, %r4, %r7; +; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; +; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9; +; CHECK-NEXT: ld.param.f64 %fd1, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10; +; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3; +; CHECK-NEXT: st.global.f64 [%rd1], %fd5; +; CHECK-NEXT: mov.u32 %r10, %r6; +; CHECK-NEXT: bra.uni $L__BB0_1; +bb: + br label %bb3 + +bb3: ; preds = %bb3, %bb + %phi = phi i32 [ 0, %bb ], [ 1, %bb3 ] + %call = tail call double @quux(double 0.000000e+00) + %mul = mul i32 %phi, %arg1 + %or = or i32 %arg2, %mul + %mul4 = mul i32 %arg, %or + %sitofp = sitofp i32 %mul4 to double + %uitofp = uitofp i32 %phi to double + %fadd = fadd double %uitofp, %sitofp + store double %fadd, ptr addrspace(1) null, align 8 + br label %bb3 +} + +declare double @quux(double) From ec67ad594b82fc2e763237d4e8d6bb2aea59110b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 18 Nov 2024 23:23:31 +0100 Subject: [PATCH 048/366] [libc++][NFC] Format --- libcxx/include/string | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/string b/libcxx/include/string index a994f65a9a6e4..bf7fc3c37ecd7 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -3374,7 +3374,7 @@ template inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target_capacity) { __annotate_delete(); - auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); + auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); size_type __cap = capacity(); size_type __sz = size(); From 50209e994200c98236a27b54e87e8c598d160402 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 18 Nov 2024 22:31:13 +0000 Subject: [PATCH 049/366] [AArch64][GlobalISel] Move and update freeze.ll test. NFC This adds a number of extra vector cases, notably the ptr vectors. 
--- .../test/CodeGen/AArch64/GlobalISel/freeze.ll | 149 ------- .../legalize-extract-vector-elt.mir | 84 +++- llvm/test/CodeGen/AArch64/freeze.ll | 382 ++++++++++++++++++ 3 files changed, 444 insertions(+), 171 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll create mode 100644 llvm/test/CodeGen/AArch64/freeze.ll diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll b/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll deleted file mode 100644 index a793ecbf03f65..0000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll +++ /dev/null @@ -1,149 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefix=GISEL - -%struct.T = type { i32, i32 } - -define i32 @freeze_int() { -; CHECK-LABEL: freeze_int: -; CHECK: // %bb.0: -; CHECK-NEXT: mul w0, w8, w8 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_int: -; GISEL: // %bb.0: -; GISEL-NEXT: mul w0, w8, w8 -; GISEL-NEXT: ret - %y1 = freeze i32 undef - %t1 = mul i32 %y1, %y1 - ret i32 %t1 -} - -define i5 @freeze_int2() { -; CHECK-LABEL: freeze_int2: -; CHECK: // %bb.0: -; CHECK-NEXT: mul w0, w8, w8 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_int2: -; GISEL: // %bb.0: -; GISEL-NEXT: mul w0, w8, w8 -; GISEL-NEXT: ret - %y1 = freeze i5 undef - %t1 = mul i5 %y1, %y1 - ret i5 %t1 -} - -define float @freeze_float() { -; CHECK-LABEL: freeze_float: -; CHECK: // %bb.0: -; CHECK-NEXT: fadd s0, s0, s0 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_float: -; GISEL: // %bb.0: -; GISEL-NEXT: fadd s0, s0, s0 -; GISEL-NEXT: ret - %y1 = freeze float undef - %t1 = fadd float %y1, %y1 - ret float %t1 -} - -define <2 x i32> @freeze_ivec() { -; CHECK-LABEL: freeze_ivec: -; CHECK: // %bb.0: -; CHECK-NEXT: add v0.2s, v0.2s, v0.2s -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_ivec: -; GISEL: // %bb.0: -; GISEL-NEXT: add v0.2s, v0.2s, v0.2s -; GISEL-NEXT: ret - %y1 = freeze <2 x i32> undef - %t1 = add <2 x i32> %y1, %y1 - ret <2 x i32> %t1 -} - -define ptr @freeze_ptr() { -; CHECK-LABEL: freeze_ptr: -; CHECK: // %bb.0: -; CHECK-NEXT: add x0, x8, #4 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_ptr: -; GISEL: // %bb.0: -; GISEL-NEXT: add x0, x8, #4 -; GISEL-NEXT: ret - %y1 = freeze ptr undef - %t1 = getelementptr i8, ptr %y1, i64 4 - ret ptr %t1 -} - -define i32 @freeze_struct() { -; CHECK-LABEL: freeze_struct: -; CHECK: // %bb.0: -; CHECK-NEXT: add w0, w8, w8 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_struct: -; GISEL: // %bb.0: -; GISEL-NEXT: add w0, w8, w8 -; GISEL-NEXT: ret - %y1 = freeze %struct.T undef - %v1 = extractvalue %struct.T %y1, 0 - %v2 = extractvalue %struct.T %y1, 1 - %t1 = add i32 %v1, %v2 - ret i32 %t1 -} - -define i32 @freeze_anonstruct() { -; CHECK-LABEL: freeze_anonstruct: -; CHECK: // %bb.0: -; CHECK-NEXT: add w0, w8, w8 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_anonstruct: -; GISEL: // %bb.0: -; GISEL-NEXT: add w0, w8, w8 -; GISEL-NEXT: ret - %y1 = freeze {i32, i32} undef - %v1 = extractvalue {i32, i32} %y1, 0 - %v2 = extractvalue {i32, i32} %y1, 1 - %t1 = add i32 %v1, %v2 - ret i32 %t1 -} - -define i32 @freeze_anonstruct2() { -; CHECK-LABEL: freeze_anonstruct2: -; CHECK: // %bb.0: -; CHECK-NEXT: add w0, w8, w8, uxth -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_anonstruct2: -; GISEL: // %bb.0: -; GISEL-NEXT: add w0, w8, w8, uxth -; GISEL-NEXT: ret - %y1 = freeze {i32, i16} undef - %v1 = 
extractvalue {i32, i16} %y1, 0 - %v2 = extractvalue {i32, i16} %y1, 1 - %z2 = zext i16 %v2 to i32 - %t1 = add i32 %v1, %z2 - ret i32 %t1 -} - -define i64 @freeze_array() { -; CHECK-LABEL: freeze_array: -; CHECK: // %bb.0: -; CHECK-NEXT: add x0, x8, x8 -; CHECK-NEXT: ret -; -; GISEL-LABEL: freeze_array: -; GISEL: // %bb.0: -; GISEL-NEXT: add x0, x8, x8 -; GISEL-NEXT: ret - %y1 = freeze [2 x i64] undef - %v1 = extractvalue [2 x i64] %y1, 0 - %v2 = extractvalue [2 x i64] %y1, 1 - %t1 = add i64 %v1, %v2 - ret i64 %t1 -} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir index 323a3993473fc..29bd1f8feb5c4 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir @@ -7,7 +7,9 @@ body: | bb.0: liveins: $q0 ; CHECK-LABEL: name: test_eve_1 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64) ; CHECK-NEXT: $x0 = COPY [[EVEC]](s64) @@ -24,7 +26,9 @@ body: | bb.0: liveins: $q0, $q1 ; CHECK-LABEL: name: test_eve_v2s1 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<2 x s64>) = G_ICMP intpred(eq), [[COPY]](<2 x s64>), [[COPY1]] @@ -46,7 +50,9 @@ body: | bb.0: liveins: $q0, $q1 ; CHECK-LABEL: name: test_eve_v4s1 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), [[COPY]](<4 x s32>), [[COPY1]] @@ -69,7 +75,9 @@ body: | bb.0: liveins: $q0, $q1 ; CHECK-LABEL: name: test_eve_v8s1 - ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s16>) = G_ICMP intpred(eq), [[COPY]](<8 x s16>), [[COPY1]] @@ -92,7 +100,9 @@ body: | bb.0: liveins: $q0, $q1 ; CHECK-LABEL: name: test_eve_v16s1 - ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s8>) = G_ICMP intpred(eq), [[COPY]](<16 x s8>), [[COPY1]] @@ -115,7 +125,9 @@ body: | bb.0: liveins: $q0, $q1 ; CHECK-LABEL: name: test_eve_v2p0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x p0>), [[C]](s64) ; CHECK-NEXT: $x0 = COPY [[EVEC]](p0) @@ -132,7 +144,9 @@ body: | 
bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v4s64 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64) ; CHECK-NEXT: $x0 = COPY [[EVEC]](s64) @@ -152,7 +166,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v2s1_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<2 x s64>) = G_ICMP intpred(eq), [[COPY]](<2 x s64>), [[COPY1]] @@ -181,7 +197,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v4s1_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), [[COPY]](<4 x s32>), [[COPY1]] @@ -211,7 +229,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v8s1_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s16>) = G_ICMP intpred(eq), [[COPY]](<8 x s16>), [[COPY1]] @@ -241,7 +261,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v16s1_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s8>) = G_ICMP intpred(eq), [[COPY]](<16 x s8>), [[COPY1]] @@ -271,7 +293,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v2p0_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[COPY]](<2 x p0>) @@ -296,7 +320,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v4s64_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 ; CHECK-NEXT: %idx:_(s64) = COPY $x0 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 @@ -326,7 +352,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v8s32 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: 
[[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64) ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) @@ -346,7 +374,9 @@ body: | bb.0: liveins: $q0, $q1, $x0 ; CHECK-LABEL: name: test_eve_v16s16 - ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: liveins: $q0, $q1, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s16>), [[C]](s64) ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[EVEC]](s16) @@ -368,7 +398,9 @@ body: | bb.0: liveins: $x0 ; CHECK-LABEL: name: test_eve_v4p0 - ; CHECK: %vec:_(<4 x p0>) = G_IMPLICIT_DEF + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %vec:_(<4 x p0>) = G_IMPLICIT_DEF ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>) ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT [[UV]](<2 x p0>), %idx(s64) @@ -386,7 +418,9 @@ body: | bb.0: liveins: $q0, $q1, $w0 ; CHECK-LABEL: name: test_eve_v8s32_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 ; CHECK-NEXT: %idx:_(s32) = COPY $w0 ; CHECK-NEXT: %idxprom:_(s64) = G_SEXT %idx(s32) @@ -418,7 +452,9 @@ body: | bb.0: liveins: $q0, $q1, $w0 ; CHECK-LABEL: name: test_eve_v16s16_unknown_idx - ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: liveins: $q0, $q1, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 ; CHECK-NEXT: %idx:_(s32) = COPY $w0 ; CHECK-NEXT: %idxprom:_(s64) = G_SEXT %idx(s32) @@ -452,7 +488,9 @@ body: | bb.0: liveins: $x0 ; CHECK-LABEL: name: test_eve_v4p0_unknown_idx - ; CHECK: %vec:_(<4 x p0>) = G_IMPLICIT_DEF + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %vec:_(<4 x p0>) = G_IMPLICIT_DEF ; CHECK-NEXT: %idx:_(s64) = COPY $x0 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>) @@ -477,15 +515,17 @@ body: | RET_ReallyLR ... --- +# Make sure that the pointer legalization rules don't apply when we have +# different address spaces. name: cant_legalize_different_address_spaces body: | bb.0: liveins: $x0 - ; Make sure that the pointer legalization rules don't apply when we have - ; different address spaces. 
; CHECK-LABEL: name: cant_legalize_different_address_spaces - ; CHECK: %vec:_(<4 x p1>) = G_IMPLICIT_DEF + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %vec:_(<4 x p1>) = G_IMPLICIT_DEF ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT %vec(<4 x p1>), %idx(s64) ; CHECK-NEXT: $x0 = COPY %eve(p0) diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll new file mode 100644 index 0000000000000..d200b24428063 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -0,0 +1,382 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for freeze_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for freeze_v3p0 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for freeze_v4p0 + +%struct.T = type { i32, i32 } + +define i32 @freeze_int() { +; CHECK-LABEL: freeze_int: +; CHECK: // %bb.0: +; CHECK-NEXT: mul w0, w8, w8 +; CHECK-NEXT: ret + %y1 = freeze i32 undef + %t1 = mul i32 %y1, %y1 + ret i32 %t1 +} + +define i5 @freeze_int2() { +; CHECK-LABEL: freeze_int2: +; CHECK: // %bb.0: +; CHECK-NEXT: mul w0, w8, w8 +; CHECK-NEXT: ret + %y1 = freeze i5 undef + %t1 = mul i5 %y1, %y1 + ret i5 %t1 +} + +define float @freeze_float() { +; CHECK-LABEL: freeze_float: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd s0, s0, s0 +; CHECK-NEXT: ret + %y1 = freeze float undef + %t1 = fadd float %y1, %y1 + ret float %t1 +} + +define <2 x i8> @freeze_v2i8() { +; CHECK-LABEL: freeze_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %y1 = freeze <2 x i8> undef + %t1 = add <2 x i8> %y1, %y1 + ret <2 x i8> %t1 +} + +define <3 x i8> @freeze_v3i8() { +; CHECK-SD-LABEL: freeze_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: umov w0, v0.h[0] +; CHECK-SD-NEXT: umov w1, v0.h[1] +; CHECK-SD-NEXT: umov w2, v0.h[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov b0, v0.b[1] +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: umov w0, v0.h[0] +; CHECK-GI-NEXT: umov w1, v0.h[1] +; CHECK-GI-NEXT: umov w2, v0.h[2] +; CHECK-GI-NEXT: ret + %y1 = freeze <3 x i8> undef + %t1 = add <3 x i8> %y1, %y1 + ret <3 x i8> %t1 +} + +define <4 x i8> @freeze_v4i8() { +; CHECK-SD-LABEL: freeze_v4i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v4i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov b0, v0.b[1] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov b2, v0.b[3] +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: ret + %y1 = freeze <4 x i8> undef + %t1 = add <4 x i8> %y1, %y1 + ret <4 x i8> %t1 +} + +define <8 x i8> @freeze_v8i8() { +; CHECK-LABEL: freeze_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8b, v0.8b, v0.8b +; CHECK-NEXT: ret + 
%y1 = freeze <8 x i8> undef + %t1 = add <8 x i8> %y1, %y1 + ret <8 x i8> %t1 +} + +define <16 x i8> @freeze_v16i8() { +; CHECK-LABEL: freeze_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <16 x i8> undef + %t1 = add <16 x i8> %y1, %y1 + ret <16 x i8> %t1 +} + +define <32 x i8> @freeze_v32i8() { +; CHECK-LABEL: freeze_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.16b, v0.16b, v0.16b +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <32 x i8> undef + %t1 = add <32 x i8> %y1, %y1 + ret <32 x i8> %t1 +} + +define <2 x i16> @freeze_v2i16() { +; CHECK-SD-LABEL: freeze_v2i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v2i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov h0, v0.h[1] +; CHECK-GI-NEXT: mov v1.s[0], w8 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: add v0.2s, v1.2s, v1.2s +; CHECK-GI-NEXT: ret + %y1 = freeze <2 x i16> undef + %t1 = add <2 x i16> %y1, %y1 + ret <2 x i16> %t1 +} + +define <3 x i16> @freeze_v3i16() { +; CHECK-LABEL: freeze_v3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %y1 = freeze <3 x i16> undef + %t1 = add <3 x i16> %y1, %y1 + ret <3 x i16> %t1 +} + +define <4 x i16> @freeze_v4i16() { +; CHECK-LABEL: freeze_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %y1 = freeze <4 x i16> undef + %t1 = add <4 x i16> %y1, %y1 + ret <4 x i16> %t1 +} + +define <8 x i16> @freeze_v8i16() { +; CHECK-LABEL: freeze_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v0.8h +; CHECK-NEXT: ret + %y1 = freeze <8 x i16> undef + %t1 = add <8 x i16> %y1, %y1 + ret <8 x i16> %t1 +} + +define <16 x i16> @freeze_v16i16() { +; CHECK-LABEL: freeze_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v0.8h +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <16 x i16> undef + %t1 = add <16 x i16> %y1, %y1 + ret <16 x i16> %t1 +} + +define <2 x i32> @freeze_v2i32() { +; CHECK-LABEL: freeze_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret + %y1 = freeze <2 x i32> undef + %t1 = add <2 x i32> %y1, %y1 + ret <2 x i32> %t1 +} + +define <3 x i32> @freeze_v3i32() { +; CHECK-LABEL: freeze_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v0.4s +; CHECK-NEXT: ret + %y1 = freeze <3 x i32> undef + %t1 = add <3 x i32> %y1, %y1 + ret <3 x i32> %t1 +} + +define <4 x i32> @freeze_v4i32() { +; CHECK-LABEL: freeze_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v0.4s +; CHECK-NEXT: ret + %y1 = freeze <4 x i32> undef + %t1 = add <4 x i32> %y1, %y1 + ret <4 x i32> %t1 +} + +define <8 x i32> @freeze_v8i32() { +; CHECK-LABEL: freeze_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v0.4s +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <8 x i32> undef + %t1 = add <8 x i32> %y1, %y1 + ret <8 x i32> %t1 +} + +define <2 x i64> @freeze_v2i64() { +; CHECK-LABEL: freeze_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-NEXT: ret + %y1 = freeze <2 x i64> undef + %t1 = add <2 x i64> %y1, %y1 + ret <2 x i64> %t1 +} + +define <3 x i64> @freeze_v3i64() { +; CHECK-SD-LABEL: freeze_v3i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: fmov d2, d0 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: 
freeze_v3i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-GI-NEXT: add x8, x8, x8 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %y1 = freeze <3 x i64> undef + %t1 = add <3 x i64> %y1, %y1 + ret <3 x i64> %t1 +} + +define <4 x i64> @freeze_v4i64() { +; CHECK-LABEL: freeze_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <4 x i64> undef + %t1 = add <4 x i64> %y1, %y1 + ret <4 x i64> %t1 +} + +define <2 x ptr> @freeze_v2p0() { +; CHECK-SD-LABEL: freeze_v2p0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: dup v0.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v2p0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI21_0 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI21_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-GI-NEXT: ret + %y1 = freeze <2 x ptr> undef + %t1 = getelementptr i32, <2 x ptr> %y1, i32 1 + ret <2 x ptr> %t1 +} + +define <3 x ptr> @freeze_v3p0() { +; CHECK-LABEL: freeze_v3p0: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4 // =0x4 +; CHECK-NEXT: dup v2.2d, x8 +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add d2, d0, d2 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: ret + %y1 = freeze <3 x ptr> undef + %t1 = getelementptr i32, <3 x ptr> %y1, i32 1 + ret <3 x ptr> %t1 +} + +define <4 x ptr> @freeze_v4p0() { +; CHECK-LABEL: freeze_v4p0: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4 // =0x4 +; CHECK-NEXT: dup v0.2d, x8 +; CHECK-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ret + %y1 = freeze <4 x ptr> undef + %t1 = getelementptr i32, <4 x ptr> %y1, i32 1 + ret <4 x ptr> %t1 +} + +define ptr @freeze_ptr() { +; CHECK-LABEL: freeze_ptr: +; CHECK: // %bb.0: +; CHECK-NEXT: add x0, x8, #4 +; CHECK-NEXT: ret + %y1 = freeze ptr undef + %t1 = getelementptr i8, ptr %y1, i64 4 + ret ptr %t1 +} + +define i32 @freeze_struct() { +; CHECK-LABEL: freeze_struct: +; CHECK: // %bb.0: +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret + %y1 = freeze %struct.T undef + %v1 = extractvalue %struct.T %y1, 0 + %v2 = extractvalue %struct.T %y1, 1 + %t1 = add i32 %v1, %v2 + ret i32 %t1 +} + +define i32 @freeze_anonstruct() { +; CHECK-LABEL: freeze_anonstruct: +; CHECK: // %bb.0: +; CHECK-NEXT: add w0, w8, w8 +; CHECK-NEXT: ret + %y1 = freeze {i32, i32} undef + %v1 = extractvalue {i32, i32} %y1, 0 + %v2 = extractvalue {i32, i32} %y1, 1 + %t1 = add i32 %v1, %v2 + ret i32 %t1 +} + +define i32 @freeze_anonstruct2() { +; CHECK-LABEL: freeze_anonstruct2: +; CHECK: // %bb.0: +; CHECK-NEXT: add w0, w8, w8, uxth +; CHECK-NEXT: ret + %y1 = freeze {i32, i16} undef + %v1 = extractvalue {i32, i16} %y1, 0 + %v2 = extractvalue {i32, i16} %y1, 1 + %z2 = zext i16 %v2 to i32 + %t1 = add i32 %v1, %z2 + ret i32 %t1 +} + +define i64 @freeze_array() { +; CHECK-LABEL: freeze_array: +; CHECK: // %bb.0: +; CHECK-NEXT: add x0, x8, x8 +; CHECK-NEXT: ret + %y1 = freeze [2 x i64] undef + %v1 = extractvalue [2 x i64] %y1, 0 + %v2 = extractvalue [2 x i64] %y1, 1 + %t1 = add i64 %v1, %v2 + ret i64 %t1 +} From 36d47f88786d29d381545a5f88a7964b47d9a595 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 18 Nov 2024 22:35:58 +0000 Subject: [PATCH 050/366] [AArch64][GlobalISel] Legalize 
ptr vector freeze and implicit defs. They can be treated the same as other s64 operations. --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +- .../legalize-extract-vector-elt.mir | 12 ++--- llvm/test/CodeGen/AArch64/freeze.ll | 54 ++++++++++++------- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index baa42302756a5..c8f01068f7218 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -103,7 +103,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v8s8, v16s8) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64); + .clampMaxNumElements(0, s64, 2) + .clampMaxNumElements(0, p0, 2); getActionDefinitionsBuilder(G_PHI) .legalFor({p0, s16, s32, s64}) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir index 29bd1f8feb5c4..b0b0e6b322a01 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir @@ -400,10 +400,9 @@ body: | ; CHECK-LABEL: name: test_eve_v4p0 ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %vec:_(<4 x p0>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x p0>) = G_IMPLICIT_DEF ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>) - ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT [[UV]](<2 x p0>), %idx(s64) + ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](<2 x p0>), %idx(s64) ; CHECK-NEXT: $x0 = COPY %eve(p0) ; CHECK-NEXT: RET_ReallyLR %vec:_(<4 x p0>) = G_IMPLICIT_DEF @@ -490,15 +489,14 @@ body: | ; CHECK-LABEL: name: test_eve_v4p0_unknown_idx ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %vec:_(<4 x p0>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x p0>) = G_IMPLICIT_DEF ; CHECK-NEXT: %idx:_(s64) = COPY $x0 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[UV]](<2 x p0>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[DEF]](<2 x p0>) ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), [[FRAME_INDEX]](p0) :: (store (<2 x s64>) into %stack.0, align 32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64) - ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[UV1]](<2 x p0>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[DEF]](<2 x p0>) ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into %stack.0 + 16, basealign 32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND %idx, [[C1]] diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index d200b24428063..6efd9f40f0068 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -3,8 +3,6 @@ ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; CHECK-GI: warning: 
Instruction selection used fallback path for freeze_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for freeze_v3p0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for freeze_v4p0 %struct.T = type { i32, i32 } @@ -294,29 +292,47 @@ define <2 x ptr> @freeze_v2p0() { } define <3 x ptr> @freeze_v3p0() { -; CHECK-LABEL: freeze_v3p0: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #4 // =0x4 -; CHECK-NEXT: dup v2.2d, x8 -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-NEXT: add d2, d0, d2 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: freeze_v3p0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: dup v2.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: add d2, d0, d2 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v3p0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI22_0 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI22_0] +; CHECK-GI-NEXT: add x8, x8, #4 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: ret %y1 = freeze <3 x ptr> undef %t1 = getelementptr i32, <3 x ptr> %y1, i32 1 ret <3 x ptr> %t1 } define <4 x ptr> @freeze_v4p0() { -; CHECK-LABEL: freeze_v4p0: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #4 // =0x4 -; CHECK-NEXT: dup v0.2d, x8 -; CHECK-NEXT: add v0.2d, v0.2d, v0.2d -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: freeze_v4p0: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #4 // =0x4 +; CHECK-SD-NEXT: dup v0.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_v4p0: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI23_0 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI23_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v0.2d +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ret %y1 = freeze <4 x ptr> undef %t1 = getelementptr i32, <4 x ptr> %y1, i32 1 ret <4 x ptr> %t1 From 1ced56540071476d0a4aa8cb5134106d02b5b7f1 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 18 Nov 2024 16:43:33 -0600 Subject: [PATCH 051/366] [Clang] Add support for scoped atomic thread fence (#115545) Summary: Previously we added support for all of the atomic GNU extensions with optional memory scoped except for `__atomic_thread_fence`. This patch adds support for that. This should ideally allow us to generically emit these LLVM scopes. 
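As a usage sketch (not taken from this patch; the function and variable names are illustrative), the new builtin pairs a standard `__ATOMIC_*` ordering with one of the `__MEMORY_SCOPE_*` values Clang already defines for the other scoped atomic builtins; on targets with no scope support it lowers to an ordinary fence, as the x86-64 checks in the test below show.

```c
// Release-fence publication limited to the device scope (sketch).
void publish(int *flag) {
  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_DEVICE);
  __scoped_atomic_store_n(flag, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
}
```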
--- clang/include/clang/Basic/Builtins.td | 6 + clang/lib/CodeGen/CGBuiltin.cpp | 130 +++++++++++++ clang/test/CodeGen/scoped-fence-ops.c | 257 ++++++++++++++++++++++++++ 3 files changed, 393 insertions(+) create mode 100644 clang/test/CodeGen/scoped-fence-ops.c diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index f5124f4633364..191dfa1dd2c77 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1995,6 +1995,12 @@ def AtomicThreadFence : Builtin { let Prototype = "void(int)"; } +def ScopedAtomicThreadFence : Builtin { + let Spellings = ["__scoped_atomic_thread_fence"]; + let Attributes = [NoThrow]; + let Prototype = "void(int, int)"; +} + def AtomicSignalFence : Builtin { let Spellings = ["__atomic_signal_fence"]; let Attributes = [NoThrow]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index df69d188306be..0916e14f182dd 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5213,6 +5213,136 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Builder.SetInsertPoint(ContBB); return RValue::get(nullptr); } + case Builtin::BI__scoped_atomic_thread_fence: { + auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic); + + Value *Order = EmitScalarExpr(E->getArg(0)); + Value *Scope = EmitScalarExpr(E->getArg(1)); + auto Ord = dyn_cast(Order); + auto Scp = dyn_cast(Scope); + if (Ord && Scp) { + SyncScope SS = ScopeModel->isValid(Scp->getZExtValue()) + ? ScopeModel->map(Scp->getZExtValue()) + : ScopeModel->map(ScopeModel->getFallBackValue()); + switch (Ord->getZExtValue()) { + case 0: // memory_order_relaxed + default: // invalid order + break; + case 1: // memory_order_consume + case 2: // memory_order_acquire + Builder.CreateFence( + llvm::AtomicOrdering::Acquire, + getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS, + llvm::AtomicOrdering::Acquire, + getLLVMContext())); + break; + case 3: // memory_order_release + Builder.CreateFence( + llvm::AtomicOrdering::Release, + getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS, + llvm::AtomicOrdering::Release, + getLLVMContext())); + break; + case 4: // memory_order_acq_rel + Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, + getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, + llvm::AtomicOrdering::AcquireRelease, + getLLVMContext())); + break; + case 5: // memory_order_seq_cst + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, + getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, + llvm::AtomicOrdering::SequentiallyConsistent, + getLLVMContext())); + break; + } + return RValue::get(nullptr); + } + + llvm::BasicBlock *ContBB = createBasicBlock("atomic.scope.continue", CurFn); + + llvm::SmallVector> + OrderBBs; + if (Ord) { + switch (Ord->getZExtValue()) { + case 0: // memory_order_relaxed + default: // invalid order + ContBB->eraseFromParent(); + return RValue::get(nullptr); + case 1: // memory_order_consume + case 2: // memory_order_acquire + OrderBBs.emplace_back(Builder.GetInsertBlock(), + llvm::AtomicOrdering::Acquire); + break; + case 3: // memory_order_release + OrderBBs.emplace_back(Builder.GetInsertBlock(), + llvm::AtomicOrdering::Release); + break; + case 4: // memory_order_acq_rel + OrderBBs.emplace_back(Builder.GetInsertBlock(), + llvm::AtomicOrdering::AcquireRelease); + break; + case 5: // memory_order_seq_cst + OrderBBs.emplace_back(Builder.GetInsertBlock(), + 
llvm::AtomicOrdering::SequentiallyConsistent); + break; + } + } else { + llvm::BasicBlock *AcquireBB = createBasicBlock("acquire", CurFn); + llvm::BasicBlock *ReleaseBB = createBasicBlock("release", CurFn); + llvm::BasicBlock *AcqRelBB = createBasicBlock("acqrel", CurFn); + llvm::BasicBlock *SeqCstBB = createBasicBlock("seqcst", CurFn); + + Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false); + llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB); + SI->addCase(Builder.getInt32(1), AcquireBB); + SI->addCase(Builder.getInt32(2), AcquireBB); + SI->addCase(Builder.getInt32(3), ReleaseBB); + SI->addCase(Builder.getInt32(4), AcqRelBB); + SI->addCase(Builder.getInt32(5), SeqCstBB); + + OrderBBs.emplace_back(AcquireBB, llvm::AtomicOrdering::Acquire); + OrderBBs.emplace_back(ReleaseBB, llvm::AtomicOrdering::Release); + OrderBBs.emplace_back(AcqRelBB, llvm::AtomicOrdering::AcquireRelease); + OrderBBs.emplace_back(SeqCstBB, + llvm::AtomicOrdering::SequentiallyConsistent); + } + + for (auto &[OrderBB, Ordering] : OrderBBs) { + Builder.SetInsertPoint(OrderBB); + if (Scp) { + SyncScope SS = ScopeModel->isValid(Scp->getZExtValue()) + ? ScopeModel->map(Scp->getZExtValue()) + : ScopeModel->map(ScopeModel->getFallBackValue()); + Builder.CreateFence(Ordering, + getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), SS, Ordering, getLLVMContext())); + Builder.CreateBr(ContBB); + } else { + llvm::DenseMap BBs; + for (unsigned Scp : ScopeModel->getRuntimeValues()) + BBs[Scp] = createBasicBlock(getAsString(ScopeModel->map(Scp)), CurFn); + + auto *SC = Builder.CreateIntCast(Scope, Builder.getInt32Ty(), false); + llvm::SwitchInst *SI = Builder.CreateSwitch(SC, ContBB); + for (unsigned Scp : ScopeModel->getRuntimeValues()) { + auto *B = BBs[Scp]; + SI->addCase(Builder.getInt32(Scp), B); + + Builder.SetInsertPoint(B); + Builder.CreateFence(Ordering, getTargetHooks().getLLVMSyncScopeID( + getLangOpts(), ScopeModel->map(Scp), + Ordering, getLLVMContext())); + Builder.CreateBr(ContBB); + } + } + } + + Builder.SetInsertPoint(ContBB); + return RValue::get(nullptr); + } case Builtin::BI__builtin_signbit: case Builtin::BI__builtin_signbitf: diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c new file mode 100644 index 0000000000000..376cb11e84d3d --- /dev/null +++ b/clang/test/CodeGen/scoped-fence-ops.c @@ -0,0 +1,257 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ +// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s +// RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \ +// RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s +// RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \ +// RUN: -fvisibility=hidden | FileCheck --check-prefix=X86_64 %s + +// AMDGCN-LABEL: define hidden void @fe1a( +// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: fence syncscope("workgroup-one-as") release +// AMDGCN-NEXT: ret void +// +// SPIRV-LABEL: define hidden spir_func void @fe1a( +// SPIRV-SAME: ) #[[ATTR0:[0-9]+]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: fence syncscope("workgroup") release +// SPIRV-NEXT: ret void +// +// X86_64-LABEL: define hidden void @fe1a( +// X86_64-SAME: ) #[[ATTR0:[0-9]+]] { +// X86_64-NEXT: [[ENTRY:.*:]] +// X86_64-NEXT: fence release +// X86_64-NEXT: ret void +// +void fe1a() { + 
__scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WRKGRP); +} + +// AMDGCN-LABEL: define hidden void @fe1b( +// AMDGCN-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr +// AMDGCN-NEXT: store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// AMDGCN-NEXT: i32 1, label %[[ACQUIRE:.*]] +// AMDGCN-NEXT: i32 2, label %[[ACQUIRE]] +// AMDGCN-NEXT: i32 3, label %[[RELEASE:.*]] +// AMDGCN-NEXT: i32 4, label %[[ACQREL:.*]] +// AMDGCN-NEXT: i32 5, label %[[SEQCST:.*]] +// AMDGCN-NEXT: ] +// AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]: +// AMDGCN-NEXT: ret void +// AMDGCN: [[ACQUIRE]]: +// AMDGCN-NEXT: fence syncscope("workgroup-one-as") acquire +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[RELEASE]]: +// AMDGCN-NEXT: fence syncscope("workgroup-one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[ACQREL]]: +// AMDGCN-NEXT: fence syncscope("workgroup-one-as") acq_rel +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[SEQCST]]: +// AMDGCN-NEXT: fence syncscope("workgroup") seq_cst +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +// SPIRV-LABEL: define hidden spir_func void @fe1b( +// SPIRV-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[ORD_ADDR:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store i32 [[ORD]], ptr [[ORD_ADDR]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR]], align 4 +// SPIRV-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// SPIRV-NEXT: i32 1, label %[[ACQUIRE:.*]] +// SPIRV-NEXT: i32 2, label %[[ACQUIRE]] +// SPIRV-NEXT: i32 3, label %[[RELEASE:.*]] +// SPIRV-NEXT: i32 4, label %[[ACQREL:.*]] +// SPIRV-NEXT: i32 5, label %[[SEQCST:.*]] +// SPIRV-NEXT: ] +// SPIRV: [[ATOMIC_SCOPE_CONTINUE]]: +// SPIRV-NEXT: ret void +// SPIRV: [[ACQUIRE]]: +// SPIRV-NEXT: fence syncscope("workgroup") acquire +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[RELEASE]]: +// SPIRV-NEXT: fence syncscope("workgroup") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[ACQREL]]: +// SPIRV-NEXT: fence syncscope("workgroup") acq_rel +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[SEQCST]]: +// SPIRV-NEXT: fence syncscope("workgroup") seq_cst +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +// X86_64-LABEL: define hidden void @fe1b( +// X86_64-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] { +// X86_64-NEXT: [[ENTRY:.*:]] +// X86_64-NEXT: [[ORD_ADDR:%.*]] = alloca i32, align 4 +// X86_64-NEXT: store i32 [[ORD]], ptr [[ORD_ADDR]], align 4 +// X86_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR]], align 4 +// X86_64-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// X86_64-NEXT: i32 1, label %[[ACQUIRE:.*]] +// X86_64-NEXT: i32 2, label %[[ACQUIRE]] +// X86_64-NEXT: i32 3, label %[[RELEASE:.*]] +// X86_64-NEXT: i32 4, label %[[ACQREL:.*]] +// X86_64-NEXT: i32 5, label %[[SEQCST:.*]] +// X86_64-NEXT: ] +// X86_64: [[ATOMIC_SCOPE_CONTINUE]]: +// X86_64-NEXT: ret void +// X86_64: [[ACQUIRE]]: +// X86_64-NEXT: fence acquire +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[RELEASE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br 
label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[ACQREL]]: +// X86_64-NEXT: fence acq_rel +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[SEQCST]]: +// X86_64-NEXT: fence seq_cst +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +void fe1b(int ord) { + __scoped_atomic_thread_fence(ord, __MEMORY_SCOPE_WRKGRP); +} + +// AMDGCN-LABEL: define hidden void @fe1c( +// AMDGCN-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr +// AMDGCN-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] +// AMDGCN-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// AMDGCN-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// AMDGCN-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] +// AMDGCN-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] +// AMDGCN-NEXT: ] +// AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]: +// AMDGCN-NEXT: ret void +// AMDGCN: [[DEVICE_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("agent-one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[SYSTEM_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[WORKGROUP_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("workgroup-one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[WAVEFRONT_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("wavefront-one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[SINGLE_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("singlethread-one-as") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +// SPIRV-LABEL: define hidden spir_func void @fe1c( +// SPIRV-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[SCOPE_ADDR:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 +// SPIRV-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] +// SPIRV-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// SPIRV-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// SPIRV-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] +// SPIRV-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] +// SPIRV-NEXT: ] +// SPIRV: [[ATOMIC_SCOPE_CONTINUE]]: +// SPIRV-NEXT: ret void +// SPIRV: [[DEVICE_SCOPE]]: +// SPIRV-NEXT: fence syncscope("device") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[SYSTEM_SCOPE]]: +// SPIRV-NEXT: fence release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[WORKGROUP_SCOPE]]: +// SPIRV-NEXT: fence syncscope("workgroup") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[WAVEFRONT_SCOPE]]: +// SPIRV-NEXT: fence syncscope("subgroup") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[SINGLE_SCOPE]]: +// SPIRV-NEXT: fence syncscope("singlethread") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +// X86_64-LABEL: define hidden void @fe1c( +// X86_64-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] { +// X86_64-NEXT: [[ENTRY:.*:]] +// X86_64-NEXT: [[SCOPE_ADDR:%.*]] = alloca i32, align 4 +// X86_64-NEXT: store i32 
[[SCOPE]], ptr [[SCOPE_ADDR]], align 4 +// X86_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 +// X86_64-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] +// X86_64-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// X86_64-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// X86_64-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] +// X86_64-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] +// X86_64-NEXT: ] +// X86_64: [[ATOMIC_SCOPE_CONTINUE]]: +// X86_64-NEXT: ret void +// X86_64: [[DEVICE_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[SYSTEM_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[WORKGROUP_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[WAVEFRONT_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[SINGLE_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// +void fe1c(int scope) { + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, scope); +} + +// AMDGCN-LABEL: define hidden void @fe2a( +// AMDGCN-SAME: ) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: ret void +// +// SPIRV-LABEL: define hidden spir_func void @fe2a( +// SPIRV-SAME: ) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: ret void +// +// X86_64-LABEL: define hidden void @fe2a( +// X86_64-SAME: ) #[[ATTR0]] { +// X86_64-NEXT: [[ENTRY:.*:]] +// X86_64-NEXT: ret void +// +void fe2a() { + __scoped_atomic_thread_fence(999, __MEMORY_SCOPE_SYSTEM); +} + +// AMDGCN-LABEL: define hidden void @fe2b( +// AMDGCN-SAME: ) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: fence syncscope("one-as") release +// AMDGCN-NEXT: ret void +// +// SPIRV-LABEL: define hidden spir_func void @fe2b( +// SPIRV-SAME: ) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: fence release +// SPIRV-NEXT: ret void +// +// X86_64-LABEL: define hidden void @fe2b( +// X86_64-SAME: ) #[[ATTR0]] { +// X86_64-NEXT: [[ENTRY:.*:]] +// X86_64-NEXT: fence release +// X86_64-NEXT: ret void +// +void fe2b() { + __scoped_atomic_thread_fence(__ATOMIC_RELEASE, 999); +} From 94d100f2ba81c2bf0ef495f68d66ba8c94c71d2a Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Mon, 18 Nov 2024 14:45:54 -0800 Subject: [PATCH 052/366] [lldb][dwarf] Compute fully qualified names on simplified template names with DWARFTypePrinter (#112811) This is the second half of https://github.com/llvm/llvm-project/pull/90008. Essentially, it replaces the work of resolving template types when we just need the qualified names with walking the DIE tree using `DWARFTypePrinter`. ### Result For an internal target, the time spent on `expr *this` for the first time reduced from 28 secs to 17 secs. 
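
As a rough illustration (not part of the patch; the free-standing helper names below are invented for the example), the new code paths in `DWARFASTParserClang::GetDIEClassTemplateParams` and `SymbolFileDWARF::FindTypes` both follow the same pattern: stream the DIE through `llvm::DWARFTypePrinter` instead of materializing clang template arguments.

```cpp
// Sketch of the pattern this patch introduces; DWARFDIE is lldb's DIE
// wrapper, which now exposes the LLVM-style accessors (getTag, find,
// getParent, resolveReferencedType, ...) that DWARFTypePrinter expects
// from its DieType template parameter.
#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

std::string printTemplateParams(DWARFDIE die) {
  std::string name;
  llvm::raw_string_ostream os(name);
  llvm::DWARFTypePrinter<DWARFDIE> printer(os);
  printer.appendAndTerminateTemplateParameters(die); // e.g. "<int>"
  return name;
}

std::string printQualifiedName(DWARFDIE die) {
  std::string name;
  llvm::raw_string_ostream os(name);
  llvm::DWARFTypePrinter<DWARFDIE> printer(os);
  printer.appendQualifiedName(die); // e.g. "t1<t3<int> >::t2"
  return name;
}
```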
--- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 21 ++- .../Plugins/SymbolFile/DWARF/DWARFBaseDIE.h | 8 ++ .../Plugins/SymbolFile/DWARF/DWARFDIE.cpp | 37 ++++++ .../Plugins/SymbolFile/DWARF/DWARFDIE.h | 17 +++ .../SymbolFile/DWARF/DWARFFormValue.cpp | 25 ++++ .../Plugins/SymbolFile/DWARF/DWARFFormValue.h | 6 + .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 36 ++--- .../TypeSystem/Clang/TypeSystemClang.cpp | 20 --- .../TypeSystem/Clang/TypeSystemClang.h | 4 - .../DWARF/x86/simplified-template-names.cpp | 36 +++++ .../SymbolFile/DWARF/DWARFDIETest.cpp | 125 ++++++++++++++++++ llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h | 2 + .../llvm/DebugInfo/DWARF/DWARFTypePrinter.h | 33 +++-- llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 9 ++ 14 files changed, 309 insertions(+), 70 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index d9bdeb560e122..37c1132c1c9f9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -45,6 +45,7 @@ #include "clang/AST/Type.h" #include "clang/Basic/Specifiers.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" #include "llvm/Demangle/Demangle.h" #include @@ -826,11 +827,11 @@ std::string DWARFASTParserClang::GetDIEClassTemplateParams(DWARFDIE die) { if (llvm::StringRef(die.GetName()).contains("<")) return {}; - TypeSystemClang::TemplateParameterInfos template_param_infos; - if (ParseTemplateParameterInfos(die, template_param_infos)) - return m_ast.PrintTemplateParams(template_param_infos); - - return {}; + std::string name; + llvm::raw_string_ostream os(name); + llvm::DWARFTypePrinter type_printer(os); + type_printer.appendAndTerminateTemplateParameters(die); + return name; } void DWARFASTParserClang::MapDeclDIEToDefDIE( @@ -1618,9 +1619,9 @@ void DWARFASTParserClang::GetUniqueTypeNameAndDeclaration( case DW_TAG_structure_type: case DW_TAG_union_type: { if (const char *class_union_struct_name = parent_decl_ctx_die.GetName()) { - qualified_name.insert( - 0, GetDIEClassTemplateParams(parent_decl_ctx_die)); qualified_name.insert(0, "::"); + qualified_name.insert(0, + GetDIEClassTemplateParams(parent_decl_ctx_die)); qualified_name.insert(0, class_union_struct_name); } parent_decl_ctx_die = parent_decl_ctx_die.GetParentDeclContextDIE(); @@ -1673,6 +1674,12 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, if (attrs.name) { GetUniqueTypeNameAndDeclaration(die, cu_language, unique_typename, unique_decl); + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, "SymbolFileDWARF({0:p}) - {1:x16}: {2} has unique name: {3} ", + static_cast(this), die.GetID(), DW_TAG_value_to_name(tag), + unique_typename.AsCString()); + } if (UniqueDWARFASTType *unique_ast_entry_type = dwarf->GetUniqueDWARFASTTypeMap().Find( unique_typename, die, unique_decl, byte_size, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h index 235343d227122..d92de658a49e8 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h @@ -24,9 +24,11 @@ class DWARFUnit; class DWARFDebugInfoEntry; class DWARFDeclContext; class SymbolFileDWARF; +class DWARFFormValue; class DWARFBaseDIE { public: + using DWARFFormValue = dwarf::DWARFFormValue; DWARFBaseDIE() = 
default; DWARFBaseDIE(DWARFUnit *cu, DWARFDebugInfoEntry *die) @@ -117,6 +119,12 @@ class DWARFBaseDIE { enum class Recurse : bool { no, yes }; DWARFAttributes GetAttributes(Recurse recurse = Recurse::yes) const; + // The following methods use LLVM naming convension in order to be are used by + // LLVM libraries. + dw_tag_t getTag() const { return Tag(); } + + const char *getShortName() const { return GetName(); } + protected: DWARFUnit *m_cu = nullptr; DWARFDebugInfoEntry *m_die = nullptr; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 4c9f1d8505f6e..362f4c44240c7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -572,6 +572,43 @@ bool DWARFDIE::GetDIENamesAndRanges( return false; } +// The following methods use LLVM naming convension in order to be are used by +// LLVM libraries. llvm::iterator_range DWARFDIE::children() const { return llvm::make_range(child_iterator(*this), child_iterator()); } + +DWARFDIE::child_iterator DWARFDIE::begin() const { + return child_iterator(*this); +} + +DWARFDIE::child_iterator DWARFDIE::end() const { return child_iterator(); } + +std::optional DWARFDIE::find(const dw_attr_t attr) const { + DWARFFormValue form_value; + if (m_die->GetAttributeValue(m_cu, attr, form_value, nullptr, false)) + return form_value; + return std::nullopt; +} + +std::optional DWARFDIE::getLanguage() const { + if (IsValid()) + return m_cu->GetDWARFLanguageType(); + return std::nullopt; +} + +DWARFDIE DWARFDIE::resolveReferencedType(dw_attr_t attr) const { + return GetReferencedDIE(attr); +} + +DWARFDIE DWARFDIE::resolveReferencedType(DWARFFormValue v) const { + if (IsValid()) + return v.Reference(); + return {}; +} + +DWARFDIE DWARFDIE::resolveTypeUnitReference() const { + if (DWARFDIE reference = GetReferencedDIE(DW_AT_signature)) + return reference; + return *this; +} diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h index 077b78eb26d0c..5c1d381930c4e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h @@ -103,8 +103,25 @@ class DWARFDIE : public DWARFBaseDIE { std::optional &call_line, std::optional &call_column, DWARFExpressionList *frame_base) const; + // The following methods use LLVM naming convension in order to be are used by + // LLVM libraries. + std::optional getLanguage() const; + + DWARFDIE getParent() const { return GetParent(); } + + DWARFDIE resolveReferencedType(dw_attr_t attr) const; + + DWARFDIE resolveReferencedType(DWARFFormValue v) const; + + DWARFDIE resolveTypeUnitReference() const; + + std::optional find(const dw_attr_t attr) const; + /// The range of all the children of this DIE. 
llvm::iterator_range children() const; + + child_iterator begin() const; + child_iterator end() const; }; class DWARFDIE::child_iterator diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp index 404e50d57a925..fd3d45cef4c5e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp @@ -574,6 +574,31 @@ uint64_t DWARFFormValue::Reference(dw_offset_t base_offset) const { } } +std::optional DWARFFormValue::getAsUnsignedConstant() const { + if ((!IsDataForm(m_form)) || m_form == lldb_private::dwarf::DW_FORM_sdata) + return std::nullopt; + return m_value.uval; +} + +std::optional DWARFFormValue::getAsSignedConstant() const { + if ((!IsDataForm(m_form)) || + (m_form == lldb_private::dwarf::DW_FORM_udata && + uint64_t(std::numeric_limits::max()) < m_value.uval)) + return std::nullopt; + switch (m_form) { + case lldb_private::dwarf::DW_FORM_data4: + return int32_t(m_value.uval); + case lldb_private::dwarf::DW_FORM_data2: + return int16_t(m_value.uval); + case lldb_private::dwarf::DW_FORM_data1: + return int8_t(m_value.uval); + case lldb_private::dwarf::DW_FORM_sdata: + case lldb_private::dwarf::DW_FORM_data8: + default: + return m_value.sval; + } +} + const uint8_t *DWARFFormValue::BlockData() const { return m_value.data; } bool DWARFFormValue::IsBlockForm(const dw_form_t form) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h index 8ab9163e645fe..613948f2f3c9b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h @@ -76,6 +76,12 @@ class DWARFFormValue { void Clear(); static bool FormIsSupported(dw_form_t form); + // The following methods use LLVM naming convension in order to be are used by + // LLVM libraries. + std::optional getAsUnsignedConstant() const; + std::optional getAsSignedConstant() const; + const char *getAsCString() const { return AsCString(); } + protected: // Compile unit where m_value was located. // It may be different from compile unit where m_value refers to. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 8ce0db4588a46..47050d86409a6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -9,6 +9,7 @@ #include "SymbolFileDWARF.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Format.h" @@ -2810,33 +2811,14 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { return true; // Keep iterating over index types, language mismatch. } - // Check the context matches - std::vector die_context; - if (query.GetModuleSearch()) - die_context = die.GetDeclContext(); - else - die_context = die.GetTypeLookupContext(); - assert(!die_context.empty()); - if (!query_simple.ContextMatches(die_context)) - return true; // Keep iterating over index types, context mismatch. - - // Try to resolve the type. - if (Type *matching_type = ResolveType(die, true, true)) { - ConstString name = matching_type->GetQualifiedName(); - // We have found a type that still might not match due to template - // parameters. 
If we create a new TypeQuery that uses the new type's - // fully qualified name, we can find out if this type matches at all - // context levels. We can't use just the "match_simple" context - // because all template parameters were stripped off. The fully - // qualified name of the type will have the template parameters and - // will allow us to make sure it matches correctly. - TypeQuery die_query(name.GetStringRef(), - TypeQueryOptions::e_exact_match); - if (!query.ContextMatches(die_query.GetContextRef())) - return true; // Keep iterating over index types, context mismatch. - - results.InsertUnique(matching_type->shared_from_this()); - } + std::string qualified_name; + llvm::raw_string_ostream os(qualified_name); + llvm::DWARFTypePrinter type_printer(os); + type_printer.appendQualifiedName(die); + TypeQuery die_query(qualified_name, e_exact_match); + if (query.ContextMatches(die_query.GetContextRef())) + if (Type *matching_type = ResolveType(die, true, true)) + results.InsertUnique(matching_type->shared_from_this()); return !results.Done(query); // Keep iterating if we aren't done. }); if (results.Done(query)) { diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 1a77c7cf9161a..5f8163211857c 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1403,26 +1403,6 @@ static TemplateParameterList *CreateTemplateParameterList( return template_param_list; } -std::string TypeSystemClang::PrintTemplateParams( - const TemplateParameterInfos &template_param_infos) { - llvm::SmallVector ignore; - clang::TemplateParameterList *template_param_list = - CreateTemplateParameterList(getASTContext(), template_param_infos, - ignore); - llvm::SmallVector args( - template_param_infos.GetArgs()); - if (template_param_infos.hasParameterPack()) { - llvm::ArrayRef pack_args = - template_param_infos.GetParameterPackArgs(); - args.append(pack_args.begin(), pack_args.end()); - } - std::string str; - llvm::raw_string_ostream os(str); - clang::printTemplateArgumentList(os, args, GetTypePrintingPolicy(), - template_param_list); - return str; -} - clang::FunctionTemplateDecl *TypeSystemClang::CreateFunctionTemplateDecl( clang::DeclContext *decl_ctx, OptionalClangModuleID owning_module, clang::FunctionDecl *func_decl, diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index e39aedec7e390..678eaed381fd4 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -1148,10 +1148,6 @@ class TypeSystemClang : public TypeSystem { bool SetDeclIsForcefullyCompleted(const clang::TagDecl *td); - /// Return the template parameters (including surrounding <>) in string form. - std::string - PrintTemplateParams(const TemplateParameterInfos &template_param_infos); - private: /// Returns the PrintingPolicy used when generating the internal type names. /// These type names are mostly used for the formatter selection. 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp new file mode 100644 index 0000000000000..bc34b16607cf9 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp @@ -0,0 +1,36 @@ +// Test lldb is able to compute the fully qualified names on templates with +// -gsimple-template-names and -fdebug-types-section. + +// REQUIRES: lld + +// Test against logging to see if we print the fully qualified names correctly. +// RUN: %clangxx --target=x86_64-pc-linux -g -gsimple-template-names %s -o %t +// RUN: %lldb %t -o "log enable dwarf comp" -o "target variable v3" -o exit | FileCheck %s --check-prefix=LOG + +// Test that we following DW_AT_signature correctly. If not, lldb might confuse the types of v1 and v2. +// RUN: %clangxx --target=x86_64-pc-linux -g -gsimple-template-names -fdebug-types-section %s -o %t +// RUN: %lldb %t -o "target variable v1 v2" -o exit | FileCheck %s --check-prefix=TYPE + +// LOG: unique name: t3 >::t4 + +// TYPE: (t2 >) v1 = {} +// TYPE-NEXT: (t2 >) v2 = {} + +struct outer_struct1 { + template struct t1 {}; +}; + +struct outer_struct2 { + template struct t1 {}; +}; + +template struct t2 {}; +t2> v1; +t2> v2; + +template struct t3 { + struct t4 {}; +}; +t3>::t4 v3; + +int main() {} diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..ae63e286cc155 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -14,6 +14,7 @@ #include "lldb/Symbol/Type.h" #include "lldb/lldb-private-enumerations.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -394,3 +395,127 @@ TEST(DWARFDIETest, GetContextInFunction) { EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +TEST(DWARFDIETest, TestDWARFTypePrinter) { + // Make sure we can get template parameters and qualified names correctly with + // DWARFTypePrinter when using -gsimple-template-names. 
+ + // 0x0000000b: DW_TAG_compile_unit + // 0x0000000c: DW_TAG_base_type + // DW_AT_name ("int") + // 0x00000011: DW_TAG_structure_type + // DW_AT_name ("t1") + // 0x00000015: DW_TAG_template_type_parameter + // DW_AT_type (0x0000001f "t3") + // 0x0000001a: DW_TAG_structure_type + // DW_AT_name ("t2") + // 0x0000001e: NULL + // 0x0000001f: DW_TAG_structure_type + // DW_AT_name ("t3") + // 0x00000023: DW_TAG_template_type_parameter + // DW_AT_type (0x0000000c "int") + // 0x00000028: NULL + // 0x00000029: NULL + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x4 + Tag: DW_TAG_template_type_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x5 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x6 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Code: 0x7 + Tag: DW_TAG_template_type_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + - AbbrCode: 0x2 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: int + - AbbrCode: 0x3 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: t1 + - AbbrCode: 0x4 + Values: + - Value: 0x0000001f # update + - AbbrCode: 0x5 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: t2 + - AbbrCode: 0x0 + - AbbrCode: 0x6 + Values: + - Value: 0xDEADBEEFDEADBEEF + CStr: t3 + - AbbrCode: 0x7 + Values: + - Value: 0x0000000c # update + - AbbrCode: 0x0 + - AbbrCode: 0x0)"; + YAMLModuleTester t(yamldata); + auto *symbol_file = + llvm::cast(t.GetModule()->GetSymbolFile()); + DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0); + std::string debug_str; + StreamString debug_os; + unit->Dump(&debug_os); + ASSERT_TRUE(unit); + + DWARFDIE t1_die = unit->GetDIE(0x11); + std::string template_name; + llvm::raw_string_ostream template_name_os(template_name); + llvm::DWARFTypePrinter template_name_printer(template_name_os); + template_name_printer.appendAndTerminateTemplateParameters(t1_die); + EXPECT_THAT(template_name, " >"); + + DWARFDIE t2_die = unit->GetDIE(0x1a); + std::string qualified_name; + llvm::raw_string_ostream qualified_name_os(qualified_name); + llvm::DWARFTypePrinter qualified_name_printer(qualified_name_os); + qualified_name_printer.appendQualifiedName(t2_die); + EXPECT_THAT(qualified_name, "t1 >::t2"); +} diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h index 69c91835a4d9a..2e98a4a397147 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -226,6 +226,8 @@ class DWARFDie { bool addressRangeContainsAddress(const uint64_t Address) const; + std::optional getLanguage() const; + Expected getLocations(dwarf::Attribute Attr) const; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h index 87e876273c4b9..962462b827825 100644 
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h @@ -11,6 +11,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/Support/Error.h" #include @@ -107,13 +108,11 @@ void DWARFTypePrinter::appendArrayType(const DieType &D) { if (std::optional UpperV = C.find(dwarf::DW_AT_upper_bound)) UB = UpperV->getAsUnsignedConstant(); - if (std::optional LV = - D.getDwarfUnit()->getUnitDIE().find(dwarf::DW_AT_language)) - if (std::optional LC = LV->getAsUnsignedConstant()) - if ((DefaultLB = - LanguageLowerBound(static_cast(*LC)))) - if (LB && *LB == *DefaultLB) - LB = std::nullopt; + if (std::optional LV = D.getLanguage()) + if ((DefaultLB = + LanguageLowerBound(static_cast(*LV)))) + if (LB && *LB == *DefaultLB) + LB = std::nullopt; if (!LB && !Count && !UB) OS << "[]"; else if (!LB && (Count || UB) && DefaultLB) @@ -150,6 +149,16 @@ template DieType resolveReferencedType(DieType D, typename DieType::DWARFFormValue F) { return D.resolveReferencedType(F); } +template +const char *toString(std::optional F) { + if (F) { + llvm::Expected E = F->getAsCString(); + if (E) + return *E; + llvm::consumeError(E.takeError()); + } + return nullptr; +} } // namespace detail template @@ -239,7 +248,7 @@ DieType DWARFTypePrinter::appendUnqualifiedNameBefore( appendConstVolatileQualifierBefore(D); break; case dwarf::DW_TAG_namespace: { - if (const char *Name = dwarf::toString(D.find(dwarf::DW_AT_name), nullptr)) + if (const char *Name = detail::toString(D.find(dwarf::DW_AT_name))) OS << Name; else OS << "(anonymous namespace)"; @@ -261,7 +270,7 @@ DieType DWARFTypePrinter::appendUnqualifiedNameBefore( case DW_TAG_base_type: */ default: { - const char *NamePtr = dwarf::toString(D.find(dwarf::DW_AT_name), nullptr); + const char *NamePtr = detail::toString(D.find(dwarf::DW_AT_name)); if (!NamePtr) { appendTypeTagName(D.getTag()); return DieType(); @@ -440,7 +449,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DieType D, if (T.getTag() == dwarf::DW_TAG_pointer_type || T.getTag() == dwarf::DW_TAG_reference_type) continue; - const char *RawName = dwarf::toString(T.find(dwarf::DW_AT_name), nullptr); + const char *RawName = detail::toString(T.find(dwarf::DW_AT_name)); assert(RawName); StringRef Name = RawName; auto V = C.find(dwarf::DW_AT_const_value); @@ -533,7 +542,7 @@ bool DWARFTypePrinter::appendTemplateParameters(DieType D, } if (C.getTag() == dwarf::DW_TAG_GNU_template_template_param) { const char *RawName = - dwarf::toString(C.find(dwarf::DW_AT_GNU_template_name), nullptr); + detail::toString(C.find(dwarf::DW_AT_GNU_template_name)); assert(RawName); StringRef Name = RawName; Sep(); @@ -593,7 +602,7 @@ void DWARFTypePrinter::appendConstVolatileQualifierAfter(DieType N) { decomposeConstVolatile(N, T, C, V); if (T && T.getTag() == dwarf::DW_TAG_subroutine_type) appendSubroutineNameAfter(T, detail::resolveReferencedType(T), false, - C.isValid(), V.isValid()); + static_cast(C), static_cast(V)); else appendUnqualifiedNameAfter(T, detail::resolveReferencedType(T)); } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index dcce484a7a37e..a0ce7810f91b0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -413,6 +413,15 @@ bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { return false; } +std::optional DWARFDie::getLanguage() const { + if (isValid()) { + if (std::optional LV = + 
U->getUnitDIE().find(dwarf::DW_AT_language)) + return LV->getAsUnsignedConstant(); + } + return std::nullopt; +} + Expected DWARFDie::getLocations(dwarf::Attribute Attr) const { std::optional Location = find(Attr); From b35f40688e3079d888932e0a35caa0b02d90db97 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 18 Nov 2024 15:16:48 -0800 Subject: [PATCH 053/366] [MemProf] Change the STACK_ID record to fixed width values (#116448) The stack ids are hashes that are close to 64 bits in size, so emitting as a pair of 32-bit fixed-width values is more efficient than a VBR. This reduced the summary bitcode size for a large target by about 1%. Bump the index version and ensure we can read the old format. --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 2 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 11 ++++++- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 27 +++++++++++++----- llvm/test/Bitcode/summary_version.ll | 2 +- .../X86/Inputs/memprof-old-stackid-summary.bc | Bin 0 -> 2128 bytes .../X86/memprof-old-stackid-summary.ll | 20 +++++++++++++ 6 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc create mode 100644 llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 50def0eaf7886..39c60229aa1d8 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1463,7 +1463,7 @@ class ModuleSummaryIndex { // in the way some record are interpreted, like flags for instance. // Note that incrementing this may require changes in both BitcodeReader.cpp // and BitcodeWriter.cpp. - static constexpr uint64_t BitcodeSummaryVersion = 11; + static constexpr uint64_t BitcodeSummaryVersion = 12; // Regular LTO module name for ASM writer static constexpr const char *getRegularLTOModuleName() { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 9ca76b54a88d9..3e6abacac2726 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7997,7 +7997,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { case bitc::FS_STACK_IDS: { // [n x stackid] // Save stack ids in the reader to consult when adding stack ids from the // lists in the stack node and alloc node entries. - StackIds = ArrayRef(Record); + if (Version <= 11) { + StackIds = ArrayRef(Record); + break; + } + // This is an array of 32-bit fixed-width values, holding each 64-bit + // context id as a pair of adjacent (most significant first) 32-bit words. + assert(Record.size() % 2 == 0); + StackIds.reserve(Record.size() / 2); + for (auto R = Record.begin(); R != Record.end(); R += 2) + StackIds.push_back(*R << 32 | *(R + 1)); break; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 5829af39cf5e2..24a4c2e8303d5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4429,12 +4429,17 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS)); // numids x stackid StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - // FIXME: The stack ids are hashes that are close to 64 bits in size, so - // emitting as a pair of 32-bit fixed-width values, as we do for context - // ids, would be more efficient. 
- StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + // The stack ids are hashes that are close to 64 bits in size, so emitting + // as a pair of 32-bit fixed-width values is more efficient than a VBR. + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv)); - Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId); + SmallVector Vals; + Vals.reserve(Index->stackIds().size() * 2); + for (auto Id : Index->stackIds()) { + Vals.push_back(static_cast(Id >> 32)); + Vals.push_back(static_cast(Id)); + } + Stream.EmitRecord(bitc::FS_STACK_IDS, Vals, StackIdAbbvId); } // n x context id @@ -4624,9 +4629,17 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS)); // numids x stackid StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + // The stack ids are hashes that are close to 64 bits in size, so emitting + // as a pair of 32-bit fixed-width values is more efficient than a VBR. + StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv)); - Stream.EmitRecord(bitc::FS_STACK_IDS, StackIds, StackIdAbbvId); + SmallVector Vals; + Vals.reserve(StackIds.size() * 2); + for (auto Id : StackIds) { + Vals.push_back(static_cast(Id >> 32)); + Vals.push_back(static_cast(Id)); + } + Stream.EmitRecord(bitc::FS_STACK_IDS, Vals, StackIdAbbvId); } // Abbrev for FS_COMBINED_PROFILE. diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll index c8d36f812c208..c95c145a08788 100644 --- a/llvm/test/Bitcode/summary_version.ll +++ b/llvm/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: diff --git a/llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc b/llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc new file mode 100644 index 0000000000000000000000000000000000000000..78b134cb44216e9dd72e1c140830d50fd83825bc GIT binary patch literal 2128 zcmX|CeM}q46(6(UHQ?+n33su>db8~Vd^)1@B^QaRjbiiRgz{yK z0Yk2>ja{vySSOQOI&@7Xb4jmh`wzL}k5o-7U@7$xU+Sc%3MmNL&IDrd!$QT7v+AVgY;_ypG+pBn$5KriEWU(yS)*kQ_W!k133VDsd|=+`C@DT7E*9B^BO2`?RZWw+U{f>edKZBY2l#hpG0U?eOEPS{ ziI246BLV4>S2DUJzgHTYm5e5QDr@G&J z=>5;bmG}Q5uMNGwBK-0nogZEKlsT;mgLhvy9*MtpQ^Bw22&o&}!j1Q{2nAoEK&*i- zFq2w+SxY=teB`yoLb0tlhMac~t{Mk)A2r&F!N~#bCL%wlh}|GjueCGtbB23Uw1+RsZwptXF+P3Fz;qO{AJ6Xuq7j!XE5uXur-2N z(mUnDF8tC(yy2qc0~(aPamnk#himZRXGQSak)U*5BMs;z-=s7;8G7Z(4)G*Q{A-rj z=CQWa6umx0r}K2hO)qiuLddY>R{u3Fnp2{AGEZ;Iri5sUVwQybZBY8A)g@fpH5;uPOLSOIhNQ4FW4Hx${aJ_#5~~Wl@PO% zrPp(YzpEIJ?k&@_c;Z^vl6F=ZPE^~jVECGMOOVdORPJIXMEM)w%zo6$grF{xXl?hIl}^HfCz4J z-|(I48u3c{yN`&cJekW9j~mIyNzfvzutBIbopgQ@ zQbj@7pMg+@B2DJ34A9viQt8Hpq+w~wkZodC=V(uLJ5S`4_>%CG4PNBJLpprWA)QZ3 z?*!rOt%S-NldTEalo3xrI;8j{l1)g}yl25bhr%rNL`4zE| zRd2FnE(6Y9ZwwD##78xf{`8^SvIX++hGe|gI$-Ze$)P~!MgTR%g-tfL$xCG zE1&8r=&@(VtBaf@LT)8B)Rm83^}73bQD---nAy4IEEfZ-gz^pa3yjGHiun zjLYz3AUBH;JRj&lVUtzi%KLF3GQLc~U0tOvMB^@&s-UpE{CjGbwglFdHpL=z2>A8F zyG|8bU8oP)lkxkq=IqYp-<_hDwm;qa=9*Mo{y_;<*&NlBRW`@sHxEQoRHZ$lDXXg9 zK%8*Wk%o6Ds^d6PGr?8=qKLDfbU5|_N$smVc@rxLVhF6K6qC>Z=)-^u1XUS9?3julybVb$* zU1W#1 z+fwFjZl9;WU*M0zccIJAnvZpMws{(aI=A348SD9DbwWcu?>S!Aahz|cGu0X0Mw7AA z%?C`z^TuPfLw((S{Ud#~Jzaf6f!YgwL%p57USGebcF^x`@9yfT^>hU~JC64BD7yX+ DUEs5- literal 0 HcmV?d00001 diff --git a/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll 
b/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll new file mode 100644 index 0000000000000..10048f8674a08 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll @@ -0,0 +1,20 @@ +;; Check that we can read the old STACK_ID summary format that encoded the id as +;; a VBR8 instead of as a pair of 32-bit fixed-width values. +;; +;; The old bitcode was generated by the older compiler from `opt -thinlto-bc` +;; on the following LLVM assembly: +;; +;; target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +;; target triple = "x86_64-unknown-linux-gnu" +;; +;; define void @bar() { +;; call void @foo(), !callsite !0 +;; ret void +;; } +;; +;; declare void @foo() +;; +;; !0 = !{i64 9086428284934609951} + +; RUN: llvm-dis %S/Inputs/memprof-old-stackid-summary.bc -o - | FileCheck %s +; CHECK: stackIds: (9086428284934609951) From 5d33010f5edee8030d7b7d78c6e6f6992b659082 Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 18 Nov 2024 23:25:45 +0000 Subject: [PATCH 054/366] [NFC][flang] Fix driver linker tests on Windows (#116667) --- flang/test/Driver/dynamic-linker.f90 | 2 +- flang/test/Driver/isysroot.f90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90 index 6d5c443ab75cb..e850939374568 100644 --- a/flang/test/Driver/dynamic-linker.f90 +++ b/flang/test/Driver/dynamic-linker.f90 @@ -17,7 +17,7 @@ ! GNU-LINKER-OPTIONS-SAME: "-static" ! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir" -! RDYNAMIC-LINKER-OPTION: "{{.*}}ld" +! RDYNAMIC-LINKER-OPTION: "{{.*}}ld{{(\.lld)?(\.exe)?}}" ! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic" ! For MSVC, adding -static does not add any additional linker options. diff --git a/flang/test/Driver/isysroot.f90 b/flang/test/Driver/isysroot.f90 index 28b435cce08ed..07ffb68653147 100644 --- a/flang/test/Driver/isysroot.f90 +++ b/flang/test/Driver/isysroot.f90 @@ -8,7 +8,7 @@ ! RUN: %flang -### --target=aarch64-linux-gnu -isysroot /path/to/sysroot \ ! RUN: %s 2>&1 | FileCheck %s --check-prefix=CHECK-LINUX -! CHECK-DARWIN: "{{.*}}ld{{(64)?(\.lld)?}}" {{.*}}"-syslibroot" "/path/to/sysroot" +! CHECK-DARWIN: "{{.*}}ld{{(64)?(\.lld)?(\.exe)?}}" {{.*}}"-syslibroot" "/path/to/sysroot" ! Unused on Linux. ! CHECK-LINUX: warning: argument unused during compilation: '-isysroot /path/to/sysroot' ! CHECK-LINUX-NOT: /path/to/sysroot From 9c3665c8d26ba041a6e582e83cc2de0a1f63be48 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Mon, 18 Nov 2024 15:29:49 -0800 Subject: [PATCH 055/366] [rtsan] Add I/O multiplexing interceptors (#115227) Intercepts in the family of `poll`, `select` and modern equivalents `epoll` (linux only) and `kqueue` bsd family only. These calls mirror the names of the system calls they call, which have been verified on mac at least (e.g. kevent calls the system call kevent). 
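
For context, a minimal user-side example of what these interceptors catch (illustrative only; `-fsanitize=realtime` and the `[[clang::nonblocking]]` attribute are the documented RTSan entry points, and the exact diagnostic text is an assumption):

```cpp
// Compile with: clang++ -fsanitize=realtime example.cpp
#include <poll.h>

void process_audio() [[clang::nonblocking]] {
  pollfd fds[1] = {{/*fd=*/0, /*events=*/POLLIN, /*revents=*/0}};
  // With this patch, the poll() interceptor notifies RTSan, which then
  // reports a real-time violation (naming "poll") when this is reached
  // from a nonblocking context.
  poll(fds, 1, /*timeout=*/0);
}
```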
--- .../lib/rtsan/rtsan_interceptors_posix.cpp | 111 ++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 168 ++++++++++++++++++ .../sanitizer_platform_interceptors.h | 2 + 3 files changed, 281 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 3a1b1f6524745..497db4ecc6ef4 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -42,6 +42,7 @@ void OSSpinLockLock(volatile OSSpinLock *__lock); #endif #include +#include #include #include #include @@ -612,6 +613,104 @@ INTERCEPTOR(int, shutdown, int socket, int how) { return REAL(shutdown)(socket, how); } +// I/O Multiplexing + +INTERCEPTOR(int, poll, struct pollfd *fds, nfds_t nfds, int timeout) { + __rtsan_notify_intercepted_call("poll"); + return REAL(poll)(fds, nfds, timeout); +} + +#if !SANITIZER_APPLE +// FIXME: This should work on all unix systems, even Mac, but currently +// it is showing some weird error while linking +// error: declaration of 'select' has a different language linkage +INTERCEPTOR(int, select, int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) { + __rtsan_notify_intercepted_call("select"); + return REAL(select)(nfds, readfds, writefds, exceptfds, timeout); +} +#define RTSAN_MAYBE_INTERCEPT_SELECT INTERCEPT_FUNCTION(select) +#else +#define RTSAN_MAYBE_INTERCEPT_SELECT +#endif // !SANITIZER_APPLE + +INTERCEPTOR(int, pselect, int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, const struct timespec *timeout, + const sigset_t *sigmask) { + __rtsan_notify_intercepted_call("pselect"); + return REAL(pselect)(nfds, readfds, writefds, exceptfds, timeout, sigmask); +} + +#if SANITIZER_INTERCEPT_EPOLL +INTERCEPTOR(int, epoll_create, int size) { + __rtsan_notify_intercepted_call("epoll_create"); + return REAL(epoll_create)(size); +} + +INTERCEPTOR(int, epoll_create1, int flags) { + __rtsan_notify_intercepted_call("epoll_create1"); + return REAL(epoll_create1)(flags); +} + +INTERCEPTOR(int, epoll_ctl, int epfd, int op, int fd, + struct epoll_event *event) { + __rtsan_notify_intercepted_call("epoll_ctl"); + return REAL(epoll_ctl)(epfd, op, fd, event); +} + +INTERCEPTOR(int, epoll_wait, int epfd, struct epoll_event *events, + int maxevents, int timeout) { + __rtsan_notify_intercepted_call("epoll_wait"); + return REAL(epoll_wait)(epfd, events, maxevents, timeout); +} + +INTERCEPTOR(int, epoll_pwait, int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask) { + __rtsan_notify_intercepted_call("epoll_pwait"); + return REAL(epoll_pwait)(epfd, events, maxevents, timeout, sigmask); +} +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE INTERCEPT_FUNCTION(epoll_create) +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1 INTERCEPT_FUNCTION(epoll_create1) +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CTL INTERCEPT_FUNCTION(epoll_ctl) +#define RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT INTERCEPT_FUNCTION(epoll_wait) +#define RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT INTERCEPT_FUNCTION(epoll_pwait) +#else +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1 +#define RTSAN_MAYBE_INTERCEPT_EPOLL_CTL +#define RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT +#define RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT +#endif // SANITIZER_INTERCEPT_EPOLL + +#if SANITIZER_INTERCEPT_KQUEUE +INTERCEPTOR(int, kqueue, void) { + __rtsan_notify_intercepted_call("kqueue"); + return REAL(kqueue)(); +} + +INTERCEPTOR(int, kevent, int kq, const struct 
kevent *changelist, int nchanges, + struct kevent *eventlist, int nevents, + const struct timespec *timeout) { + __rtsan_notify_intercepted_call("kevent"); + return REAL(kevent)(kq, changelist, nchanges, eventlist, nevents, timeout); +} + +INTERCEPTOR(int, kevent64, int kq, const struct kevent64_s *changelist, + int nchanges, struct kevent64_s *eventlist, int nevents, + unsigned int flags, const struct timespec *timeout) { + __rtsan_notify_intercepted_call("kevent64"); + return REAL(kevent64)(kq, changelist, nchanges, eventlist, nevents, flags, + timeout); +} +#define RTSAN_MAYBE_INTERCEPT_KQUEUE INTERCEPT_FUNCTION(kqueue) +#define RTSAN_MAYBE_INTERCEPT_KEVENT INTERCEPT_FUNCTION(kevent) +#define RTSAN_MAYBE_INTERCEPT_KEVENT64 INTERCEPT_FUNCTION(kevent64) +#else +#define RTSAN_MAYBE_INTERCEPT_KQUEUE +#define RTSAN_MAYBE_INTERCEPT_KEVENT +#define RTSAN_MAYBE_INTERCEPT_KEVENT64 +#endif // SANITIZER_INTERCEPT_KQUEUE + // Preinit void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(calloc); @@ -696,6 +795,18 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(sendto); INTERCEPT_FUNCTION(shutdown); INTERCEPT_FUNCTION(socket); + + RTSAN_MAYBE_INTERCEPT_SELECT; + INTERCEPT_FUNCTION(pselect); + INTERCEPT_FUNCTION(poll); + RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE; + RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1; + RTSAN_MAYBE_INTERCEPT_EPOLL_CTL; + RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT; + RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT; + RTSAN_MAYBE_INTERCEPT_KQUEUE; + RTSAN_MAYBE_INTERCEPT_KEVENT; + RTSAN_MAYBE_INTERCEPT_KEVENT64; } #endif // SANITIZER_POSIX diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index d0ae12c9bea44..5be62b9790638 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -28,8 +28,18 @@ #include #endif +#if SANITIZER_INTERCEPT_EPOLL +#include +#endif + +#if SANITIZER_INTERCEPT_KQUEUE +#include +#include +#endif + #include #include +#include #include #include #include @@ -779,4 +789,162 @@ TEST(TestRtsanInterceptors, ShutdownOnASocketDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +/* + I/O Multiplexing +*/ + +TEST(TestRtsanInterceptors, PollDiesWhenRealtime) { + struct pollfd fds[1]; + fds[0].fd = 0; + fds[0].events = POLLIN; + + auto Func = [&fds]() { poll(fds, 1, 0); }; + + ExpectRealtimeDeath(Func, "poll"); + ExpectNonRealtimeSurvival(Func); +} + +#if !SANITIZER_APPLE +// FIXME: This should work on Darwin as well +// see the comment near the interceptor +TEST(TestRtsanInterceptors, SelectDiesWhenRealtime) { + fd_set readfds; + FD_ZERO(&readfds); + FD_SET(0, &readfds); + struct timeval timeout = {0, 0}; + + auto Func = [&readfds, &timeout]() { + select(1, &readfds, nullptr, nullptr, &timeout); + }; + ExpectRealtimeDeath(Func, "select"); + ExpectNonRealtimeSurvival(Func); +} +#endif + +TEST(TestRtsanInterceptors, PSelectDiesWhenRealtime) { + fd_set readfds; + FD_ZERO(&readfds); + FD_SET(0, &readfds); + struct timespec timeout = {0, 0}; + + auto Func = [&]() { + pselect(1, &readfds, nullptr, nullptr, &timeout, nullptr); + }; + ExpectRealtimeDeath(Func, "pselect"); + ExpectNonRealtimeSurvival(Func); +} + +#if SANITIZER_INTERCEPT_EPOLL +TEST(TestRtsanInterceptors, EpollCreateDiesWhenRealtime) { + auto Func = []() { epoll_create(1); }; + ExpectRealtimeDeath(Func, "epoll_create"); + ExpectNonRealtimeSurvival(Func); +} + +TEST(TestRtsanInterceptors, EpollCreate1DiesWhenRealtime) { + auto Func = []() { 
epoll_create1(EPOLL_CLOEXEC); }; + ExpectRealtimeDeath(Func, "epoll_create1"); + ExpectNonRealtimeSurvival(Func); +} + +class EpollTest : public ::testing::Test { +protected: + void SetUp() override { + epfd = epoll_create1(EPOLL_CLOEXEC); + ASSERT_GE(epfd, 0); + } + + void TearDown() override { + if (epfd >= 0) + close(epfd); + } + + int GetEpollFd() { return epfd; } + +private: + int epfd = -1; +}; + +TEST_F(EpollTest, EpollCtlDiesWhenRealtime) { + auto Func = [this]() { + struct epoll_event event = {.events = EPOLLIN, .data = {.fd = 0}}; + epoll_ctl(GetEpollFd(), EPOLL_CTL_ADD, 0, &event); + }; + ExpectRealtimeDeath(Func, "epoll_ctl"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(EpollTest, EpollWaitDiesWhenRealtime) { + auto Func = [this]() { + struct epoll_event events[1]; + epoll_wait(GetEpollFd(), events, 1, 0); + }; + + ExpectRealtimeDeath(Func, "epoll_wait"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(EpollTest, EpollPWaitDiesWhenRealtime) { + auto Func = [this]() { + struct epoll_event events[1]; + epoll_pwait(GetEpollFd(), events, 1, 0, nullptr); + }; + + ExpectRealtimeDeath(Func, "epoll_pwait"); + ExpectNonRealtimeSurvival(Func); +} +#endif // SANITIZER_INTERCEPT_EPOLL + +#if SANITIZER_INTERCEPT_KQUEUE +TEST(TestRtsanInterceptors, KqueueDiesWhenRealtime) { + auto Func = []() { kqueue(); }; + ExpectRealtimeDeath(Func, "kqueue"); + ExpectNonRealtimeSurvival(Func); +} + +class KqueueTest : public ::testing::Test { +protected: + void SetUp() override { + kq = kqueue(); + ASSERT_GE(kq, 0); + } + + void TearDown() override { + if (kq >= 0) + close(kq); + } + + int GetKqueueFd() { return kq; } + +private: + int kq = -1; +}; + +TEST_F(KqueueTest, KeventDiesWhenRealtime) { + struct kevent event; + EV_SET(&event, 0, EVFILT_READ, EV_ADD, 0, 0, nullptr); + struct timespec timeout = {0, 0}; + + auto Func = [this, event, timeout]() { + kevent(GetKqueueFd(), &event, 1, nullptr, 0, &timeout); + }; + + ExpectRealtimeDeath(Func, "kevent"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(KqueueTest, Kevent64DiesWhenRealtime) { + struct kevent64_s event; + EV_SET64(&event, 0, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0); + struct timespec timeout = {0, 0}; + + auto Func = [this, event, timeout]() { + kevent64(GetKqueueFd(), &event, 1, nullptr, 0, 0, &timeout); + }; + + ExpectRealtimeDeath(Func, "kevent64"); + ExpectNonRealtimeSurvival(Func); +} +#endif // SANITIZER_INTERCEPT_KQUEUE + #endif // SANITIZER_POSIX diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 3fd6b595ef197..7f9d4998bf757 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -339,6 +339,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_GETGROUPS SI_POSIX #define SANITIZER_INTERCEPT_POLL SI_POSIX #define SANITIZER_INTERCEPT_PPOLL SI_LINUX_NOT_ANDROID || SI_SOLARIS +#define SANITIZER_INTERCEPT_EPOLL (SI_LINUX) +#define SANITIZER_INTERCEPT_KQUEUE (SI_FREEBSD || SI_NETBSD || SI_MAC) #define SANITIZER_INTERCEPT_WORDEXP \ (SI_FREEBSD || SI_NETBSD || (SI_MAC && !SI_IOS) || SI_LINUX_NOT_ANDROID || \ SI_SOLARIS) From dc087d1a338ca07b77c28522abb063e712b3877d Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Mon, 18 Nov 2024 18:32:20 -0500 Subject: [PATCH 056/366] Avoid undefined behavior in shift operators during constant folding of DIExpressions. 
(#116466) Bit shift operations with a shift operand greater than or equal to the bit width of the (promoted) value type result in undefined behavior according to C++ [expr.shift]p1. This change adds checking for this situation and avoids attempts to constant fold DIExpressions that would otherwise provoke such behavior. An existing test that presumably intended to exercise shifts at the UB boundary has been updated; it now checks for shifts of 64 bits instead of 65. This issue was reported by a static analysis tool; no actual cases of shift operations that would result in undefined behavior in practice have been identified. --- llvm/lib/IR/DIExpressionOptimizer.cpp | 6 ++++-- llvm/unittests/IR/MetadataTest.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/IR/DIExpressionOptimizer.cpp b/llvm/lib/IR/DIExpressionOptimizer.cpp index 2bb8eac348c8e..be9e13a34235a 100644 --- a/llvm/lib/IR/DIExpressionOptimizer.cpp +++ b/llvm/lib/IR/DIExpressionOptimizer.cpp @@ -59,12 +59,14 @@ foldOperationIfPossible(uint64_t Const1, uint64_t Const2, return Const1 - Const2; } case dwarf::DW_OP_shl: { - if ((uint64_t)countl_zero(Const1) < Const2) + if (Const2 >= std::numeric_limits::digits || + static_cast(countl_zero(Const1)) < Const2) return std::nullopt; return Const1 << Const2; } case dwarf::DW_OP_shr: { - if ((uint64_t)countr_zero(Const1) < Const2) + if (Const2 >= std::numeric_limits::digits || + static_cast(countr_zero(Const1)) < Const2) return std::nullopt; return Const1 >> Const2; } diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp index fbdab1975df72..628221339c89b 100644 --- a/llvm/unittests/IR/MetadataTest.cpp +++ b/llvm/unittests/IR/MetadataTest.cpp @@ -3541,12 +3541,12 @@ TEST_F(DIExpressionTest, Fold) { ResExpr = DIExpression::get(Context, ResOps); EXPECT_EQ(E, ResExpr); - // Test a left shift greater than 64. + // Test a left shift greater than 63. Ops.clear(); Ops.push_back(dwarf::DW_OP_constu); Ops.push_back(1); Ops.push_back(dwarf::DW_OP_constu); - Ops.push_back(65); + Ops.push_back(64); Ops.push_back(dwarf::DW_OP_shl); Expr = DIExpression::get(Context, Ops); E = Expr->foldConstantMath(); @@ -3554,17 +3554,17 @@ TEST_F(DIExpressionTest, Fold) { ResOps.push_back(dwarf::DW_OP_constu); ResOps.push_back(1); ResOps.push_back(dwarf::DW_OP_constu); - ResOps.push_back(65); + ResOps.push_back(64); ResOps.push_back(dwarf::DW_OP_shl); ResExpr = DIExpression::get(Context, ResOps); EXPECT_EQ(E, ResExpr); - // Test a right shift greater than 64. + // Test a right shift greater than 63. 
Ops.clear(); Ops.push_back(dwarf::DW_OP_constu); Ops.push_back(1); Ops.push_back(dwarf::DW_OP_constu); - Ops.push_back(65); + Ops.push_back(64); Ops.push_back(dwarf::DW_OP_shr); Expr = DIExpression::get(Context, Ops); E = Expr->foldConstantMath(); @@ -3572,7 +3572,7 @@ TEST_F(DIExpressionTest, Fold) { ResOps.push_back(dwarf::DW_OP_constu); ResOps.push_back(1); ResOps.push_back(dwarf::DW_OP_constu); - ResOps.push_back(65); + ResOps.push_back(64); ResOps.push_back(dwarf::DW_OP_shr); ResExpr = DIExpression::get(Context, ResOps); EXPECT_EQ(E, ResExpr); From 2310e3e3f2ccdab156abc7f9d186b2605027d8fe Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Mon, 18 Nov 2024 15:39:28 -0800 Subject: [PATCH 057/366] [GlobalISel] Move DemandedElt's APInt size assert after isValid() check (#115979) This prevents the assertion from wrongly triggering on invalid LLT's --- llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 40d4a5250dfbb..a700d866afa4e 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -147,6 +147,15 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, unsigned Opcode = MI.getOpcode(); LLT DstTy = MRI.getType(R); + // Handle the case where this is called on a register that does not have a + // type constraint (i.e. it has a register class constraint instead). This is + // unlikely to occur except by looking through copies but it is possible for + // the initial register being queried to be in this state. + if (!DstTy.isValid()) { + Known = KnownBits(); + return; + } + #ifndef NDEBUG if (DstTy.isFixedVector()) { assert( @@ -158,15 +167,6 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, } #endif - // Handle the case where this is called on a register that does not have a - // type constraint (i.e. it has a register class constraint instead). This is - // unlikely to occur except by looking through copies but it is possible for - // the initial register being queried to be in this state. - if (!DstTy.isValid()) { - Known = KnownBits(); - return; - } - unsigned BitWidth = DstTy.getScalarSizeInBits(); auto CacheEntry = ComputeKnownBitsCache.find(R); if (CacheEntry != ComputeKnownBitsCache.end()) { From 2de78815604e9027efd93cac27c517bf732587d2 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Mon, 18 Nov 2024 15:48:53 -0800 Subject: [PATCH 058/366] [NFC] Move DroppedVariableStats to its own file and redesign it to be extensible. (#115563) Move DroppedVariableStats code to its own file and change the class to have an extensible design so that we can use it to add dropped statistics to MIR passes and the instruction selector. 
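
The extension point is the pair of pure-virtual visitation hooks declared in the new header below. A hedged sketch of how a later MIR-level consumer might plug into them (the class name and the MIR-specific bodies are placeholders, not part of this patch):

```cpp
// Sketch only: subclass DroppedVariableStats and implement the two hooks.
class DroppedVariableStatsMIR : public llvm::DroppedVariableStats {
public:
  DroppedVariableStatsMIR()
      : llvm::DroppedVariableStats(/*DroppedVarStatsEnabled=*/true) {}

private:
  void visitEveryInstruction(
      unsigned &DroppedCount,
      llvm::DenseMap<llvm::VarID, llvm::DILocation *> &InlinedAtsMap,
      llvm::VarID Var) override {
    // Walk each MachineInstr, inspect its DebugLoc and scope, and call
    // updateDroppedCount(...) to decide whether Var was really dropped.
  }

  void visitEveryDebugRecord(
      llvm::DenseSet<llvm::VarID> &VarIDSet,
      llvm::DenseMap<llvm::StringRef,
                     llvm::DenseMap<llvm::VarID, llvm::DILocation *>>
          &InlinedAtsMap,
      llvm::StringRef FuncName, bool Before) override {
    // Record every DBG_VALUE's variable via populateVarIDSetAndInlinedMap(...).
  }
};
```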
--- .../llvm/CodeGen/DroppedVariableStats.h | 226 ++++++++++++++++++ .../llvm/Passes/StandardInstrumentations.h | 80 +------ llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/DroppedVariableStats.cpp | 194 +++++++++++++++ llvm/lib/Passes/StandardInstrumentations.cpp | 178 +------------- llvm/unittests/IR/CMakeLists.txt | 2 +- ...est.cpp => DroppedVariableStatsIRTest.cpp} | 73 +++--- 7 files changed, 455 insertions(+), 299 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/DroppedVariableStats.h create mode 100644 llvm/lib/CodeGen/DroppedVariableStats.cpp rename llvm/unittests/IR/{DroppedVariableStatsTest.cpp => DroppedVariableStatsIRTest.cpp} (91%) diff --git a/llvm/include/llvm/CodeGen/DroppedVariableStats.h b/llvm/include/llvm/CodeGen/DroppedVariableStats.h new file mode 100644 index 0000000000000..371d775b02e87 --- /dev/null +++ b/llvm/include/llvm/CodeGen/DroppedVariableStats.h @@ -0,0 +1,226 @@ +///===- DroppedVariableStats.h - Opt Diagnostics -*- C++ -*----------------===// +/// +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM +/// Exceptions. See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +/// +///===---------------------------------------------------------------------===// +/// \file +/// Dropped Variable Statistics for Debug Information. Reports any number +/// of #dbg_value that get dropped due to an optimization pass. +/// +///===---------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_DROPPEDVARIABLESTATS_H +#define LLVM_CODEGEN_DROPPEDVARIABLESTATS_H + +#include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassInstrumentation.h" + +namespace llvm { + +/// A unique key that represents a debug variable. +/// First const DIScope *: Represents the scope of the debug variable. +/// Second const DIScope *: Represents the InlinedAt scope of the debug +/// variable. const DILocalVariable *: It is a pointer to the debug variable +/// itself. +using VarID = + std::tuple; + +/// A base class to collect and print dropped debug information variable +/// statistics. +class DroppedVariableStats { +public: + DroppedVariableStats(bool DroppedVarStatsEnabled) + : DroppedVariableStatsEnabled(DroppedVarStatsEnabled) { + if (DroppedVarStatsEnabled) + llvm::outs() + << "Pass Level, Pass Name, Num of Dropped Variables, Func or " + "Module Name\n"; + }; + + virtual ~DroppedVariableStats() = default; + + // We intend this to be unique per-compilation, thus no copies. + DroppedVariableStats(const DroppedVariableStats &) = delete; + void operator=(const DroppedVariableStats &) = delete; + + bool getPassDroppedVariables() { return PassDroppedVariables; } + +protected: + void setup() { + DebugVariablesStack.push_back( + {DenseMap()}); + InlinedAts.push_back( + {DenseMap>()}); + } + + void cleanup() { + assert(!DebugVariablesStack.empty() && + "DebugVariablesStack shouldn't be empty!"); + assert(!InlinedAts.empty() && "InlinedAts shouldn't be empty!"); + DebugVariablesStack.pop_back(); + InlinedAts.pop_back(); + } + + bool DroppedVariableStatsEnabled = false; + struct DebugVariables { + /// DenseSet of VarIDs before an optimization pass has run. + DenseSet DebugVariablesBefore; + /// DenseSet of VarIDs after an optimization pass has run. 
+ DenseSet DebugVariablesAfter; + }; + +protected: + /// A stack of a DenseMap, that maps DebugVariables for every pass to an + /// llvm::Function. A stack is used because an optimization pass can call + /// other passes. + SmallVector> DebugVariablesStack; + + /// A DenseSet tracking whether a scope was visited before. + DenseSet VisitedScope; + /// A stack of DenseMaps, which map the name of an llvm::Function to a + /// DenseMap of VarIDs and their inlinedAt locations before an optimization + /// pass has run. + SmallVector>> InlinedAts; + /// Calculate the number of dropped variables in an llvm::Function or + /// llvm::MachineFunction and print the relevant information to stdout. + void calculateDroppedStatsAndPrint(DebugVariables &DbgVariables, + StringRef FuncName, StringRef PassID, + StringRef FuncOrModName, + StringRef PassLevel, const Function *Func); + + /// Check if a \p Var has been dropped or is a false positive. Also update the + /// \p DroppedCount if a debug variable is dropped. + bool updateDroppedCount(DILocation *DbgLoc, const DIScope *Scope, + const DIScope *DbgValScope, + DenseMap &InlinedAtsMap, + VarID Var, unsigned &DroppedCount); + /// Run code to populate relevant data structures over an llvm::Function or + /// llvm::MachineFunction. + void run(DebugVariables &DbgVariables, StringRef FuncName, bool Before); + /// Populate the VarIDSet and InlinedAtMap with the relevant information + /// needed for before and after pass analysis to determine dropped variable + /// status. + void populateVarIDSetAndInlinedMap( + const DILocalVariable *DbgVar, DebugLoc DbgLoc, DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before); + /// Visit every llvm::Instruction or llvm::MachineInstruction and check if the + /// debug variable denoted by its ID \p Var may have been dropped by an + /// optimization pass. + virtual void + visitEveryInstruction(unsigned &DroppedCount, + DenseMap &InlinedAtsMap, + VarID Var) = 0; + /// Visit every debug record in an llvm::Function or llvm::MachineFunction + /// and call populateVarIDSetAndInlinedMap on it. + virtual void visitEveryDebugRecord( + DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) = 0; + +private: + /// Remove a dropped debug variable's VarID from all Sets in the + /// DroppedVariablesBefore stack. + void removeVarFromAllSets(VarID Var, const Function *F) { + // Do not remove Var from the last element, it will be popped from the + // stack. + for (auto &DebugVariablesMap : llvm::drop_end(DebugVariablesStack)) + DebugVariablesMap[F].DebugVariablesBefore.erase(Var); + } + /// Return true if \p Scope is the same as \p DbgValScope or a child scope of + /// \p DbgValScope, return false otherwise. + bool isScopeChildOfOrEqualTo(const DIScope *Scope, + const DIScope *DbgValScope); + /// Return true if \p InlinedAt is the same as \p DbgValInlinedAt or part of + /// the InlinedAt chain, return false otherwise. + bool isInlinedAtChildOfOrEqualTo(const DILocation *InlinedAt, + const DILocation *DbgValInlinedAt); + bool PassDroppedVariables = false; +}; + +/// A class to collect and print dropped debug information due to LLVM IR +/// optimization passes. After every LLVM IR pass is run, it will print how many +/// #dbg_values were dropped due to that pass. 
+class DroppedVariableStatsIR : public DroppedVariableStats { +public: + DroppedVariableStatsIR(bool DroppedVarStatsEnabled) + : llvm::DroppedVariableStats(DroppedVarStatsEnabled) {} + + virtual ~DroppedVariableStatsIR() = default; + + void runBeforePass(Any IR) { + setup(); + if (const auto *M = unwrapIR(IR)) + return this->runOnModule(M, true); + if (const auto *F = unwrapIR(IR)) + return this->runOnFunction(F, true); + } + + void runAfterPass(StringRef P, Any IR) { + if (const auto *M = unwrapIR(IR)) + runAfterPassModule(P, M); + else if (const auto *F = unwrapIR(IR)) + runAfterPassFunction(P, F); + cleanup(); + } + + void registerCallbacks(PassInstrumentationCallbacks &PIC); + +private: + const Function *Func; + + void runAfterPassFunction(StringRef PassID, const Function *F) { + runOnFunction(F, false); + calculateDroppedVarStatsOnFunction(F, PassID, F->getName().str(), + "Function"); + } + + void runAfterPassModule(StringRef PassID, const Module *M) { + runOnModule(M, false); + calculateDroppedVarStatsOnModule(M, PassID, M->getName().str(), "Module"); + } + /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or + /// after a pass has run to facilitate dropped variable calculation for an + /// llvm::Function. + void runOnFunction(const Function *F, bool Before); + /// Iterate over all Instructions in a Function and report any dropped debug + /// information. + void calculateDroppedVarStatsOnFunction(const Function *F, StringRef PassID, + StringRef FuncOrModName, + StringRef PassLevel); + /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or + /// after a pass has run to facilitate dropped variable calculation for an + /// llvm::Module. Calls runOnFunction on every Function in the Module. + void runOnModule(const Module *M, bool Before); + /// Iterate over all Functions in a Module and report any dropped debug + /// information. Will call calculateDroppedVarStatsOnFunction on every + /// Function. + void calculateDroppedVarStatsOnModule(const Module *M, StringRef PassID, + StringRef FuncOrModName, + StringRef PassLevel); + /// Override base class method to run on an llvm::Function specifically. + virtual void + visitEveryInstruction(unsigned &DroppedCount, + DenseMap &InlinedAtsMap, + VarID Var) override; + /// Override base class method to run on #dbg_values specifically. + virtual void visitEveryDebugRecord( + DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) override; + + template static const IRUnitT *unwrapIR(Any IR) { + const IRUnitT **IRPtr = llvm::any_cast(&IR); + return IRPtr ? *IRPtr : nullptr; + } +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 9301a12c740ee..12a34c099eaff 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" +#include "llvm/CodeGen/DroppedVariableStats.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -579,83 +580,6 @@ class PrintCrashIRInstrumentation { static void SignalHandler(void *); }; -/// A class to collect and print dropped debug information variable statistics. -/// After every LLVM IR pass is run, it will print how many #dbg_values were -/// dropped due to that pass. 
-class DroppedVariableStats { -public: - DroppedVariableStats(bool DroppedVarStatsEnabled) { - if (DroppedVarStatsEnabled) - llvm::outs() - << "Pass Level, Pass Name, Num of Dropped Variables, Func or " - "Module Name\n"; - }; - // We intend this to be unique per-compilation, thus no copies. - DroppedVariableStats(const DroppedVariableStats &) = delete; - void operator=(const DroppedVariableStats &) = delete; - - void registerCallbacks(PassInstrumentationCallbacks &PIC); - void runBeforePass(StringRef PassID, Any IR); - void runAfterPass(StringRef PassID, Any IR, const PreservedAnalyses &PA); - void runAfterPassInvalidated(StringRef PassID, const PreservedAnalyses &PA); - bool getPassDroppedVariables() { return PassDroppedVariables; } - -private: - bool PassDroppedVariables = false; - /// A unique key that represents a #dbg_value. - using VarID = - std::tuple; - - struct DebugVariables { - /// DenseSet of VarIDs before an optimization pass has run. - DenseSet DebugVariablesBefore; - /// DenseSet of VarIDs after an optimization pass has run. - DenseSet DebugVariablesAfter; - }; - - /// A stack of a DenseMap, that maps DebugVariables for every pass to an - /// llvm::Function. A stack is used because an optimization pass can call - /// other passes. - SmallVector> DebugVariablesStack; - - /// A DenseSet tracking whether a scope was visited before. - DenseSet VisitedScope; - /// A stack of DenseMaps, which map the name of an llvm::Function to a - /// DenseMap of VarIDs and their inlinedAt locations before an optimization - /// pass has run. - SmallVector>> InlinedAts; - - /// Iterate over all Functions in a Module and report any dropped debug - /// information. Will call calculateDroppedVarStatsOnFunction on every - /// Function. - void calculateDroppedVarStatsOnModule(const Module *M, StringRef PassID, - std::string FuncOrModName, - std::string PassLevel); - /// Iterate over all Instructions in a Function and report any dropped debug - /// information. - void calculateDroppedVarStatsOnFunction(const Function *F, StringRef PassID, - std::string FuncOrModName, - std::string PassLevel); - /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or - /// after a pass has run to facilitate dropped variable calculation for an - /// llvm::Function. - void runOnFunction(const Function *F, bool Before); - /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or - /// after a pass has run to facilitate dropped variable calculation for an - /// llvm::Module. Calls runOnFunction on every Function in the Module. - void runOnModule(const Module *M, bool Before); - /// Remove a dropped #dbg_value VarID from all Sets in the - /// DroppedVariablesBefore stack. - void removeVarFromAllSets(VarID Var, const Function *F); - /// Return true if \p Scope is the same as \p DbgValScope or a child scope of - /// \p DbgValScope, return false otherwise. - bool isScopeChildOfOrEqualTo(DIScope *Scope, const DIScope *DbgValScope); - /// Return true if \p InlinedAt is the same as \p DbgValInlinedAt or part of - /// the InlinedAt chain, return false otherwise. - bool isInlinedAtChildOfOrEqualTo(const DILocation *InlinedAt, - const DILocation *DbgValInlinedAt); -}; - /// This class provides an interface to register all the standard pass /// instrumentations and manages their state (if any). 
class StandardInstrumentations { @@ -673,7 +597,7 @@ class StandardInstrumentations { PrintCrashIRInstrumentation PrintCrashIR; IRChangedTester ChangeTester; VerifyInstrumentation Verify; - DroppedVariableStats DroppedStats; + DroppedVariableStatsIR DroppedStatsIR; bool VerifyEach; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 7b47c0e6f75db..263d4a9ee94d2 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -50,6 +50,7 @@ add_llvm_component_library(LLVMCodeGen DeadMachineInstructionElim.cpp DetectDeadLanes.cpp DFAPacketizer.cpp + DroppedVariableStats.cpp DwarfEHPrepare.cpp EarlyIfConversion.cpp EdgeBundles.cpp diff --git a/llvm/lib/CodeGen/DroppedVariableStats.cpp b/llvm/lib/CodeGen/DroppedVariableStats.cpp new file mode 100644 index 0000000000000..122fcad1293f1 --- /dev/null +++ b/llvm/lib/CodeGen/DroppedVariableStats.cpp @@ -0,0 +1,194 @@ +///===- DroppedVariableStats.cpp ------------------------------------------===// +/// +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM +/// Exceptions. See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +/// +///===---------------------------------------------------------------------===// +/// \file +/// Dropped Variable Statistics for Debug Information. Reports any number +/// of #dbg_value that get dropped due to an optimization pass. +/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/DroppedVariableStats.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +bool DroppedVariableStats::isScopeChildOfOrEqualTo(const DIScope *Scope, + const DIScope *DbgValScope) { + while (Scope != nullptr) { + if (VisitedScope.find(Scope) == VisitedScope.end()) { + VisitedScope.insert(Scope); + if (Scope == DbgValScope) { + VisitedScope.clear(); + return true; + } + Scope = Scope->getScope(); + } else { + VisitedScope.clear(); + return false; + } + } + return false; +} + +bool DroppedVariableStats::isInlinedAtChildOfOrEqualTo( + const DILocation *InlinedAt, const DILocation *DbgValInlinedAt) { + if (DbgValInlinedAt == InlinedAt) + return true; + if (!DbgValInlinedAt) + return false; + auto *IA = InlinedAt; + while (IA) { + if (IA == DbgValInlinedAt) + return true; + IA = IA->getInlinedAt(); + } + return false; +} + +void DroppedVariableStats::calculateDroppedStatsAndPrint( + DebugVariables &DbgVariables, StringRef FuncName, StringRef PassID, + StringRef FuncOrModName, StringRef PassLevel, const Function *Func) { + unsigned DroppedCount = 0; + DenseSet &DebugVariablesBeforeSet = DbgVariables.DebugVariablesBefore; + DenseSet &DebugVariablesAfterSet = DbgVariables.DebugVariablesAfter; + DenseMap &InlinedAtsMap = InlinedAts.back()[FuncName]; + // Find an Instruction that shares the same scope as the dropped #dbg_value or + // has a scope that is the child of the scope of the #dbg_value, and has an + // inlinedAt equal to the inlinedAt of the #dbg_value or it's inlinedAt chain + // contains the inlinedAt of the #dbg_value, if such an Instruction is found, + // debug information is dropped. 
+ for (VarID Var : DebugVariablesBeforeSet) { + if (DebugVariablesAfterSet.contains(Var)) + continue; + visitEveryInstruction(DroppedCount, InlinedAtsMap, Var); + removeVarFromAllSets(Var, Func); + } + if (DroppedCount > 0) { + llvm::outs() << PassLevel << ", " << PassID << ", " << DroppedCount << ", " + << FuncOrModName << "\n"; + PassDroppedVariables = true; + } else + PassDroppedVariables = false; +} + +bool DroppedVariableStats::updateDroppedCount( + DILocation *DbgLoc, const DIScope *Scope, const DIScope *DbgValScope, + DenseMap &InlinedAtsMap, VarID Var, + unsigned &DroppedCount) { + + // If the Scope is a child of, or equal to the DbgValScope and is inlined at + // the Var's InlinedAt location, return true to signify that the Var has been + // dropped. + if (isScopeChildOfOrEqualTo(Scope, DbgValScope)) + if (isInlinedAtChildOfOrEqualTo(DbgLoc->getInlinedAt(), + InlinedAtsMap[Var])) { + // Found another instruction in the variable's scope, so there exists a + // break point at which the variable could be observed. Count it as + // dropped. + DroppedCount++; + return true; + } + return false; +} + +void DroppedVariableStats::run(DebugVariables &DbgVariables, StringRef FuncName, + bool Before) { + auto &VarIDSet = (Before ? DbgVariables.DebugVariablesBefore + : DbgVariables.DebugVariablesAfter); + auto &InlinedAtsMap = InlinedAts.back(); + if (Before) + InlinedAtsMap.try_emplace(FuncName, DenseMap()); + VarIDSet = DenseSet(); + visitEveryDebugRecord(VarIDSet, InlinedAtsMap, FuncName, Before); +} + +void DroppedVariableStats::populateVarIDSetAndInlinedMap( + const DILocalVariable *DbgVar, DebugLoc DbgLoc, DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) { + VarID Key{DbgVar->getScope(), DbgLoc->getInlinedAtScope(), DbgVar}; + VarIDSet.insert(Key); + if (Before) + InlinedAtsMap[FuncName].try_emplace(Key, DbgLoc.getInlinedAt()); +} + +void DroppedVariableStatsIR::runOnFunction(const Function *F, bool Before) { + auto &DebugVariables = DebugVariablesStack.back()[F]; + auto FuncName = F->getName(); + Func = F; + run(DebugVariables, FuncName, Before); +} + +void DroppedVariableStatsIR::calculateDroppedVarStatsOnFunction( + const Function *F, StringRef PassID, StringRef FuncOrModName, + StringRef PassLevel) { + Func = F; + StringRef FuncName = F->getName(); + DebugVariables &DbgVariables = DebugVariablesStack.back()[F]; + calculateDroppedStatsAndPrint(DbgVariables, FuncName, PassID, FuncOrModName, + PassLevel, Func); +} + +void DroppedVariableStatsIR::runOnModule(const Module *M, bool Before) { + for (auto &F : *M) + runOnFunction(&F, Before); +} + +void DroppedVariableStatsIR::calculateDroppedVarStatsOnModule( + const Module *M, StringRef PassID, StringRef FuncOrModName, + StringRef PassLevel) { + for (auto &F : *M) { + calculateDroppedVarStatsOnFunction(&F, PassID, FuncOrModName, PassLevel); + } +} + +void DroppedVariableStatsIR::registerCallbacks( + PassInstrumentationCallbacks &PIC) { + if (!DroppedVariableStatsEnabled) + return; + + PIC.registerBeforeNonSkippedPassCallback( + [this](StringRef P, Any IR) { return runBeforePass(IR); }); + PIC.registerAfterPassCallback( + [this](StringRef P, Any IR, const PreservedAnalyses &PA) { + return runAfterPass(P, IR); + }); + PIC.registerAfterPassInvalidatedCallback( + [this](StringRef P, const PreservedAnalyses &PA) { return cleanup(); }); +} + +void DroppedVariableStatsIR::visitEveryInstruction( + unsigned &DroppedCount, DenseMap &InlinedAtsMap, + VarID Var) { + const DIScope *DbgValScope = std::get<0>(Var); + for 
(const auto &I : instructions(Func)) { + auto *DbgLoc = I.getDebugLoc().get(); + if (!DbgLoc) + continue; + if (updateDroppedCount(DbgLoc, DbgLoc->getScope(), DbgValScope, + InlinedAtsMap, Var, DroppedCount)) + break; + } +} + +void DroppedVariableStatsIR::visitEveryDebugRecord( + DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) { + for (const auto &I : instructions(Func)) { + for (DbgRecord &DR : I.getDbgRecordRange()) { + if (auto *Dbg = dyn_cast(&DR)) { + auto *DbgVar = Dbg->getVariable(); + auto DbgLoc = DR.getDebugLoc(); + populateVarIDSetAndInlinedMap(DbgVar, DbgLoc, VarIDSet, InlinedAtsMap, + FuncName, Before); + } + } + } +} diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 6259f8f736c80..b766517e68eba 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -2462,7 +2462,7 @@ StandardInstrumentations::StandardInstrumentations( PrintChanged == ChangePrinter::ColourDiffVerbose || PrintChanged == ChangePrinter::ColourDiffQuiet), WebsiteChangeReporter(PrintChanged == ChangePrinter::DotCfgVerbose), - Verify(DebugLogging), DroppedStats(DroppedVarStats), + Verify(DebugLogging), DroppedStatsIR(DroppedVarStats), VerifyEach(VerifyEach) {} PrintCrashIRInstrumentation *PrintCrashIRInstrumentation::CrashReporter = @@ -2523,180 +2523,6 @@ void PrintCrashIRInstrumentation::registerCallbacks( }); } -void DroppedVariableStats::registerCallbacks( - PassInstrumentationCallbacks &PIC) { - if (!DroppedVarStats) - return; - - PIC.registerBeforeNonSkippedPassCallback( - [this](StringRef P, Any IR) { return this->runBeforePass(P, IR); }); - PIC.registerAfterPassCallback( - [this](StringRef P, Any IR, const PreservedAnalyses &PA) { - return this->runAfterPass(P, IR, PA); - }); - PIC.registerAfterPassInvalidatedCallback( - [this](StringRef P, const PreservedAnalyses &PA) { - return this->runAfterPassInvalidated(P, PA); - }); -} - -void DroppedVariableStats::runBeforePass(StringRef PassID, Any IR) { - DebugVariablesStack.push_back({DenseMap()}); - InlinedAts.push_back({DenseMap>()}); - if (auto *M = unwrapIR(IR)) - return this->runOnModule(M, true); - if (auto *F = unwrapIR(IR)) - return this->runOnFunction(F, true); -} - -void DroppedVariableStats::runOnFunction(const Function *F, bool Before) { - auto &DebugVariables = DebugVariablesStack.back()[F]; - auto &VarIDSet = (Before ? DebugVariables.DebugVariablesBefore - : DebugVariables.DebugVariablesAfter); - auto &InlinedAtsMap = InlinedAts.back(); - auto FuncName = F->getName(); - if (Before) - InlinedAtsMap.try_emplace(FuncName, DenseMap()); - VarIDSet = DenseSet(); - for (const auto &I : instructions(F)) { - for (DbgRecord &DR : I.getDbgRecordRange()) { - if (auto *Dbg = dyn_cast(&DR)) { - auto *DbgVar = Dbg->getVariable(); - auto DbgLoc = DR.getDebugLoc(); - VarID Key{DbgVar->getScope(), DbgLoc->getInlinedAtScope(), DbgVar}; - VarIDSet.insert(Key); - if (Before) - InlinedAtsMap[FuncName].try_emplace(Key, DbgLoc.getInlinedAt()); - } - } - } -} - -void DroppedVariableStats::runOnModule(const Module *M, bool Before) { - for (auto &F : *M) - runOnFunction(&F, Before); -} - -void DroppedVariableStats::removeVarFromAllSets(VarID Var, const Function *F) { - // Do not remove Var from the last element, it will be popped from the stack. 
- for (auto &DebugVariablesMap : llvm::drop_end(DebugVariablesStack)) - DebugVariablesMap[F].DebugVariablesBefore.erase(Var); -} - -void DroppedVariableStats::calculateDroppedVarStatsOnModule( - const Module *M, StringRef PassID, std::string FuncOrModName, - std::string PassLevel) { - for (auto &F : *M) { - calculateDroppedVarStatsOnFunction(&F, PassID, FuncOrModName, PassLevel); - } -} - -void DroppedVariableStats::calculateDroppedVarStatsOnFunction( - const Function *F, StringRef PassID, std::string FuncOrModName, - std::string PassLevel) { - unsigned DroppedCount = 0; - StringRef FuncName = F->getName(); - DebugVariables &DbgVariables = DebugVariablesStack.back()[F]; - DenseSet &DebugVariablesBeforeSet = DbgVariables.DebugVariablesBefore; - DenseSet &DebugVariablesAfterSet = DbgVariables.DebugVariablesAfter; - DenseMap &InlinedAtsMap = InlinedAts.back()[FuncName]; - // Find an Instruction that shares the same scope as the dropped #dbg_value or - // has a scope that is the child of the scope of the #dbg_value, and has an - // inlinedAt equal to the inlinedAt of the #dbg_value or it's inlinedAt chain - // contains the inlinedAt of the #dbg_value, if such an Instruction is found, - // debug information is dropped. - for (VarID Var : DebugVariablesBeforeSet) { - if (DebugVariablesAfterSet.contains(Var)) - continue; - const DIScope *DbgValScope = std::get<0>(Var); - for (const auto &I : instructions(F)) { - auto *DbgLoc = I.getDebugLoc().get(); - if (!DbgLoc) - continue; - - auto *Scope = DbgLoc->getScope(); - if (isScopeChildOfOrEqualTo(Scope, DbgValScope)) { - if (isInlinedAtChildOfOrEqualTo(DbgLoc->getInlinedAt(), - InlinedAtsMap[Var])) { - // Found another instruction in the variable's scope, so there exists - // a break point at which the variable could be observed. Count it as - // dropped. 
- DroppedCount++; - break; - } - } - } - removeVarFromAllSets(Var, F); - } - if (DroppedCount > 0) { - llvm::outs() << PassLevel << ", " << PassID << ", " << DroppedCount << ", " - << FuncOrModName << "\n"; - PassDroppedVariables = true; - } else - PassDroppedVariables = false; -} - -void DroppedVariableStats::runAfterPassInvalidated( - StringRef PassID, const PreservedAnalyses &PA) { - DebugVariablesStack.pop_back(); - InlinedAts.pop_back(); -} - -void DroppedVariableStats::runAfterPass(StringRef PassID, Any IR, - const PreservedAnalyses &PA) { - std::string PassLevel; - std::string FuncOrModName; - if (auto *M = unwrapIR(IR)) { - this->runOnModule(M, false); - PassLevel = "Module"; - FuncOrModName = M->getName(); - calculateDroppedVarStatsOnModule(M, PassID, FuncOrModName, PassLevel); - } else if (auto *F = unwrapIR(IR)) { - this->runOnFunction(F, false); - PassLevel = "Function"; - FuncOrModName = F->getName(); - calculateDroppedVarStatsOnFunction(F, PassID, FuncOrModName, PassLevel); - } - - DebugVariablesStack.pop_back(); - InlinedAts.pop_back(); -} - -bool DroppedVariableStats::isScopeChildOfOrEqualTo(DIScope *Scope, - const DIScope *DbgValScope) { - while (Scope != nullptr) { - if (VisitedScope.find(Scope) == VisitedScope.end()) { - VisitedScope.insert(Scope); - if (Scope == DbgValScope) { - VisitedScope.clear(); - return true; - } - Scope = Scope->getScope(); - } else { - VisitedScope.clear(); - return false; - } - } - return false; -} - -bool DroppedVariableStats::isInlinedAtChildOfOrEqualTo( - const DILocation *InlinedAt, const DILocation *DbgValInlinedAt) { - if (DbgValInlinedAt == InlinedAt) - return true; - if (!DbgValInlinedAt) - return false; - if (!InlinedAt) - return false; - auto *IA = InlinedAt; - while (IA) { - if (IA == DbgValInlinedAt) - return true; - IA = IA->getInlinedAt(); - } - return false; -} - void StandardInstrumentations::registerCallbacks( PassInstrumentationCallbacks &PIC, ModuleAnalysisManager *MAM) { PrintIR.registerCallbacks(PIC); @@ -2712,7 +2538,7 @@ void StandardInstrumentations::registerCallbacks( WebsiteChangeReporter.registerCallbacks(PIC); ChangeTester.registerCallbacks(PIC); PrintCrashIR.registerCallbacks(PIC); - DroppedStats.registerCallbacks(PIC); + DroppedStatsIR.registerCallbacks(PIC); if (MAM) PreservedCFGChecker.registerCallbacks(PIC, *MAM); diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt index ed93ee547d223..ffa989bec269e 100644 --- a/llvm/unittests/IR/CMakeLists.txt +++ b/llvm/unittests/IR/CMakeLists.txt @@ -43,7 +43,7 @@ add_llvm_unittest(IRTests ShuffleVectorInstTest.cpp StructuralHashTest.cpp TimePassesTest.cpp - DroppedVariableStatsTest.cpp + DroppedVariableStatsIRTest.cpp TypesTest.cpp UseTest.cpp UserTest.cpp diff --git a/llvm/unittests/IR/DroppedVariableStatsTest.cpp b/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp similarity index 91% rename from llvm/unittests/IR/DroppedVariableStatsTest.cpp rename to llvm/unittests/IR/DroppedVariableStatsIRTest.cpp index 61f3a87bb355e..34803a9771850 100644 --- a/llvm/unittests/IR/DroppedVariableStatsTest.cpp +++ b/llvm/unittests/IR/DroppedVariableStatsIRTest.cpp @@ -1,5 +1,4 @@ -//===- unittests/IR/DroppedVariableStatsTest.cpp - TimePassesHandler tests -//----------===// +//===- unittests/IR/DroppedVariableStatsIRTest.cpp ------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -44,7 +43,7 @@ namespace { // This test ensures that if a #dbg_value and an instruction that exists in the // same scope as that #dbg_value are both deleted as a result of an optimization // pass, debug information is considered not dropped. -TEST(DroppedVariableStats, BothDeleted) { +TEST(DroppedVariableStatsIR, BothDeleted) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -79,9 +78,8 @@ TEST(DroppedVariableStats, BothDeleted) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -92,16 +90,15 @@ TEST(DroppedVariableStats, BothDeleted) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), false); } // This test ensures that if a #dbg_value is dropped after an optimization pass, // but an instruction that shares the same scope as the #dbg_value still exists, // debug information is conisdered dropped. -TEST(DroppedVariableStats, DbgValLost) { +TEST(DroppedVariableStatsIR, DbgValLost) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -136,9 +133,8 @@ TEST(DroppedVariableStats, DbgValLost) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -148,16 +144,15 @@ TEST(DroppedVariableStats, DbgValLost) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), true); } // This test ensures that if a #dbg_value is dropped after an optimization pass, // but an instruction that has an unrelated scope as the #dbg_value still // exists, debug information is conisdered not dropped. -TEST(DroppedVariableStats, UnrelatedScopes) { +TEST(DroppedVariableStatsIR, UnrelatedScopes) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -193,9 +188,8 @@ TEST(DroppedVariableStats, UnrelatedScopes) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -205,16 +199,15 @@ TEST(DroppedVariableStats, UnrelatedScopes) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), false); } // This test ensures that if a #dbg_value is dropped after an optimization pass, // but an instruction that has a scope which is a child of the #dbg_value scope // still exists, debug information is conisdered dropped. 
-TEST(DroppedVariableStats, ChildScopes) { +TEST(DroppedVariableStatsIR, ChildScopes) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -250,9 +243,8 @@ TEST(DroppedVariableStats, ChildScopes) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -262,9 +254,8 @@ TEST(DroppedVariableStats, ChildScopes) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), true); } @@ -272,7 +263,7 @@ TEST(DroppedVariableStats, ChildScopes) { // but an instruction that has a scope which is a child of the #dbg_value scope // still exists, and the #dbg_value is inlined at another location, debug // information is conisdered not dropped. -TEST(DroppedVariableStats, InlinedAt) { +TEST(DroppedVariableStatsIR, InlinedAt) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -308,9 +299,8 @@ TEST(DroppedVariableStats, InlinedAt) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -320,9 +310,8 @@ TEST(DroppedVariableStats, InlinedAt) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), false); } @@ -330,7 +319,7 @@ TEST(DroppedVariableStats, InlinedAt) { // but an instruction that has a scope which is a child of the #dbg_value scope // still exists, and the #dbg_value and the instruction are inlined at another // location, debug information is conisdered dropped. -TEST(DroppedVariableStats, InlinedAtShared) { +TEST(DroppedVariableStatsIR, InlinedAtShared) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -366,9 +355,8 @@ TEST(DroppedVariableStats, InlinedAtShared) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -378,9 +366,8 @@ TEST(DroppedVariableStats, InlinedAtShared) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), true); } @@ -388,7 +375,7 @@ TEST(DroppedVariableStats, InlinedAtShared) { // but an instruction that has a scope which is a child of the #dbg_value scope // still exists, and the instruction is inlined at a location that is the // #dbg_value's inlined at location, debug information is conisdered dropped. 
-TEST(DroppedVariableStats, InlinedAtChild) { +TEST(DroppedVariableStatsIR, InlinedAtChild) { PassInstrumentationCallbacks PIC; PassInstrumentation PI(&PIC); @@ -425,9 +412,8 @@ TEST(DroppedVariableStats, InlinedAtChild) { std::unique_ptr M = parseIR(C, IR); ASSERT_TRUE(M); - DroppedVariableStats Stats(true); - Stats.runBeforePass("Test", - llvm::Any(const_cast(M.get()))); + DroppedVariableStatsIR Stats(true); + Stats.runBeforePass(llvm::Any(const_cast(M.get()))); // This loop simulates an IR pass that drops debug information. for (auto &F : *M) { @@ -437,9 +423,8 @@ TEST(DroppedVariableStats, InlinedAtChild) { } break; } - PreservedAnalyses PA; Stats.runAfterPass("Test", - llvm::Any(const_cast(M.get())), PA); + llvm::Any(const_cast(M.get()))); ASSERT_EQ(Stats.getPassDroppedVariables(), true); } From 3a3517c5e9d45a1d1aae5320887478b228b0f8be Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 19 Nov 2024 00:49:22 +0100 Subject: [PATCH 059/366] [libc++] Improve the tests for vector::erase (#116265) In particular, test everything with both a normal and a min_allocator, add tests for a few corner cases and add tests with types that are trivially relocatable. Also add tests that count the number of assignments performed by vector::erase, since that is mandated by the Standard. This patch is a preparation for optimizing vector::erase. --- libcxx/test/benchmarks/ContainerBenchmarks.h | 30 ++ libcxx/test/benchmarks/deque.bench.cpp | 11 + .../benchmarks/vector_operations.bench.cpp | 10 + .../vector/vector.modifiers/common.h | 86 ++++++ .../vector.modifiers/erase_iter.pass.cpp | 200 ++++++------- .../vector.modifiers/erase_iter_iter.pass.cpp | 274 ++++++++++-------- 6 files changed, 370 insertions(+), 241 deletions(-) create mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h index 742c848328604..38e11777f488b 100644 --- a/libcxx/test/benchmarks/ContainerBenchmarks.h +++ b/libcxx/test/benchmarks/ContainerBenchmarks.h @@ -11,6 +11,8 @@ #define BENCHMARK_CONTAINER_BENCHMARKS_H #include +#include +#include #include "Utilities.h" #include "benchmark/benchmark.h" @@ -149,6 +151,34 @@ void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) { } } +template +void BM_erase_iter_in_middle(benchmark::State& st, Container, GenInputs gen) { + auto in = gen(st.range(0)); + Container c(in.begin(), in.end()); + assert(c.size() > 2); + for (auto _ : st) { + auto mid = std::next(c.begin(), c.size() / 2); + auto tmp = *mid; + auto result = c.erase(mid); // erase an element in the middle + benchmark::DoNotOptimize(result); + c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container + } +} + +template +void BM_erase_iter_at_start(benchmark::State& st, Container, GenInputs gen) { + auto in = gen(st.range(0)); + Container c(in.begin(), in.end()); + assert(c.size() > 2); + for (auto _ : st) { + auto it = c.begin(); + auto tmp = *it; + auto result = c.erase(it); // erase the first element + benchmark::DoNotOptimize(result); + c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container + } +} + template void BM_Find(benchmark::State& st, Container c, GenInputs gen) { auto in = gen(st.range(0)); diff --git a/libcxx/test/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/deque.bench.cpp index b8f3b76dd27ee..ab0ba96b12ffc 100644 --- a/libcxx/test/benchmarks/deque.bench.cpp +++ 
b/libcxx/test/benchmarks/deque.bench.cpp @@ -9,6 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 #include +#include #include "benchmark/benchmark.h" @@ -41,4 +42,14 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_size_t, std::deque{}, get BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_string, std::deque{}, getRandomStringInputs) ->Arg(TestNumInputs); +BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_int, std::deque{}, getRandomIntegerInputs) + ->Range(TestNumInputs, TestNumInputs * 10); +BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_string, std::deque{}, getRandomStringInputs) + ->Range(TestNumInputs, TestNumInputs * 10); + +BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_int, std::deque{}, getRandomIntegerInputs) + ->Range(TestNumInputs, TestNumInputs * 10); +BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_string, std::deque{}, getRandomStringInputs) + ->Range(TestNumInputs, TestNumInputs * 10); + BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/vector_operations.bench.cpp index ce8ab233fc981..1855861263324 100644 --- a/libcxx/test/benchmarks/vector_operations.bench.cpp +++ b/libcxx/test/benchmarks/vector_operations.bench.cpp @@ -54,6 +54,16 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector BENCHMARK_CAPTURE(BM_Pushback_no_grow, vector_int, std::vector{})->Arg(TestNumInputs); +BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_int, std::vector{}, getRandomIntegerInputs) + ->Range(TestNumInputs, TestNumInputs * 10); +BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_string, std::vector{}, getRandomStringInputs) + ->Range(TestNumInputs, TestNumInputs * 10); + +BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_int, std::vector{}, getRandomIntegerInputs) + ->Range(TestNumInputs, TestNumInputs * 10); +BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_string, std::vector{}, getRandomStringInputs) + ->Range(TestNumInputs, TestNumInputs * 10); + template void bm_grow(benchmark::State& state) { for (auto _ : state) { diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h b/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h new file mode 100644 index 0000000000000..72cd47a50b2c0 --- /dev/null +++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H +#define TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H + +#include "test_macros.h" + +#include // for __libcpp_is_trivially_relocatable + +#ifndef TEST_HAS_NO_EXCEPTIONS +struct Throws { + Throws() : v_(0) {} + Throws(int v) : v_(v) {} + Throws(const Throws& rhs) : v_(rhs.v_) { + if (sThrows) + throw 1; + } + Throws(Throws&& rhs) : v_(rhs.v_) { + if (sThrows) + throw 1; + } + Throws& operator=(const Throws& rhs) { + v_ = rhs.v_; + return *this; + } + Throws& operator=(Throws&& rhs) { + v_ = rhs.v_; + return *this; + } + int v_; + static bool sThrows; +}; + +bool Throws::sThrows = false; +#endif + +struct Tracker { + int copy_assignments = 0; + int move_assignments = 0; +}; + +struct TrackedAssignment { + Tracker* tracker_; + TEST_CONSTEXPR_CXX14 explicit TrackedAssignment(Tracker* tracker) : tracker_(tracker) {} + + TrackedAssignment(TrackedAssignment const&) = default; + TrackedAssignment(TrackedAssignment&&) = default; + + TEST_CONSTEXPR_CXX14 TrackedAssignment& operator=(TrackedAssignment const&) { + tracker_->copy_assignments++; + return *this; + } + TEST_CONSTEXPR_CXX14 TrackedAssignment& operator=(TrackedAssignment&&) { + tracker_->move_assignments++; + return *this; + } +}; + +struct NonTriviallyRelocatable { + int value_; + TEST_CONSTEXPR NonTriviallyRelocatable() : value_(0) {} + TEST_CONSTEXPR explicit NonTriviallyRelocatable(int v) : value_(v) {} + TEST_CONSTEXPR NonTriviallyRelocatable(NonTriviallyRelocatable const& other) : value_(other.value_) {} + TEST_CONSTEXPR NonTriviallyRelocatable(NonTriviallyRelocatable&& other) : value_(other.value_) {} + TEST_CONSTEXPR_CXX14 NonTriviallyRelocatable& operator=(NonTriviallyRelocatable const& other) { + value_ = other.value_; + return *this; + } + TEST_CONSTEXPR_CXX14 NonTriviallyRelocatable& operator=(NonTriviallyRelocatable&& other) { + value_ = other.value_; + return *this; + } + + TEST_CONSTEXPR_CXX14 friend bool operator==(NonTriviallyRelocatable const& a, NonTriviallyRelocatable const& b) { + return a.value_ == b.value_; + } +}; +LIBCPP_STATIC_ASSERT(!std::__libcpp_is_trivially_relocatable::value, ""); + +#endif // TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp index 549f29a8f7ba1..f0157eb74b90f 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp @@ -11,135 +11,79 @@ // iterator erase(const_iterator position); #include -#include #include +#include #include "asan_testing.h" +#include "common.h" #include "min_allocator.h" #include "MoveOnly.h" #include "test_macros.h" -#ifndef TEST_HAS_NO_EXCEPTIONS -struct Throws { - Throws() : v_(0) {} - Throws(int v) : v_(v) {} - Throws(const Throws& rhs) : v_(rhs.v_) { - if (sThrows) - throw 1; - } - Throws(Throws&& rhs) : v_(rhs.v_) { - if (sThrows) - throw 1; - } - Throws& operator=(const Throws& rhs) { - v_ = rhs.v_; - return *this; - } - Throws& operator=(Throws&& rhs) { - v_ = rhs.v_; - return *this; - } - int v_; - static bool sThrows; -}; - -bool Throws::sThrows = false; -#endif - -TEST_CONSTEXPR_CXX20 bool tests() { - { - int a1[] = {1, 2, 
3, 4, 5}; - std::vector l1(a1, a1 + 5); - l1.erase(l1.begin()); - assert(is_contiguous_container_asan_correct(l1)); - assert(l1 == std::vector(a1 + 1, a1 + 5)); - } +template