diff --git a/.github/new-issues-labeler.yml b/.github/new-issues-labeler.yml
index a5933d7fc9b37..860535bbe3083 100644
--- a/.github/new-issues-labeler.yml
+++ b/.github/new-issues-labeler.yml
@@ -27,3 +27,6 @@
 
 'bolt':
   - '/\bbolt(?!\-)\b/i'
+
+'infra:commit-access-request':
+  - '/Request Commit Access/'
diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h
index d362961176b32..1093f6ad78a99 100644
--- a/bolt/include/bolt/Core/BinarySection.h
+++ b/bolt/include/bolt/Core/BinarySection.h
@@ -87,6 +87,7 @@ class BinarySection {
                                    // been renamed)
   uint64_t OutputAddress{0};       // Section address for the rewritten binary.
   uint64_t OutputSize{0};          // Section size in the rewritten binary.
+                                   // Can exceed OutputContents with padding.
   uint64_t OutputFileOffset{0};    // File offset in the rewritten binary file.
   StringRef OutputContents;        // Rewritten section contents.
   const uint64_t SectionNumber;    // Order in which the section was created.
@@ -474,6 +475,11 @@ class BinarySection {
   /// Use name \p SectionName for the section during the emission.
   void emitAsData(MCStreamer &Streamer, const Twine &SectionName) const;
 
+  /// Write finalized contents of the section. If OutputSize exceeds the size of
+  /// the OutputContents, append zero padding to the stream. Return the number
+  /// of bytes written, which should match the OutputSize.
+  uint64_t write(raw_ostream &OS) const;
+
   using SymbolResolverFuncTy = llvm::function_ref<uint64_t(const MCSymbol *)>;
 
   /// Flush all pending relocations to patch original contents of sections
@@ -497,6 +503,9 @@ class BinarySection {
     IsFinalized = true;
   }
 
+  /// When writing section contents, add \p PaddingSize zero bytes at the end.
+  void addPadding(uint64_t PaddingSize) { OutputSize += PaddingSize; }
+
   /// Reorder the contents of this section according to /p Order.  If
   /// /p Inplace is true, the entire contents of the section is reordered,
   /// otherwise the new contents contain only the reordered data.
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index bdf784ec7b6f3..4b5d8154728cc 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -416,17 +416,6 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
     BF.duplicateConstantIslands();
   }
 
-  if (!FF.empty() && FF.front()->isLandingPad()) {
-    assert(!FF.front()->isEntryPoint() &&
-           "Landing pad cannot be entry point of function");
-    // If the first block of the fragment is a landing pad, it's offset from the
-    // start of the area that the corresponding LSDA describes is zero. In this
-    // case, the call site entries in that LSDA have 0 as offset to the landing
-    // pad, which the runtime interprets as "no handler". To prevent this,
-    // insert some padding.
-    Streamer.emitBytes(BC.MIB->getTrapFillValue());
-  }
-
   // Track the first emitted instruction with debug info.
   bool FirstInstr = true;
   for (BinaryBasicBlock *const BB : FF) {
@@ -926,39 +915,54 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) {
   // Emit the LSDA header.
 
   // If LPStart is omitted, then the start of the FDE is used as a base for
-  // landing pad displacements. Then if a cold fragment starts with
-  // a landing pad, this means that the first landing pad offset will be 0.
-  // As a result, the exception handling runtime will ignore this landing pad
-  // because zero offset denotes the absence of a landing pad.
-  // For this reason, when the binary has fixed starting address we emit LPStart
-  // as 0 and output the absolute value of the landing pad in the table.
+  // landing pad displacements. Then, if a cold fragment starts with a landing
+  // pad, this means that the first landing pad offset will be 0. However, the
+  // C++ runtime treats 0 as if there is no landing pad present, thus we *must*
+  // emit non-zero offsets for all valid LPs.
   //
-  // If the base address can change, we cannot use absolute addresses for
-  // landing pads (at least not without runtime relocations). Hence, we fall
-  // back to emitting landing pads relative to the FDE start.
-  // As we are emitting label differences, we have to guarantee both labels are
-  // defined in the same section and hence cannot place the landing pad into a
-  // cold fragment when the corresponding call site is in the hot fragment.
-  // Because of this issue and the previously described issue of possible
-  // zero-offset landing pad we have to place landing pads in the same section
-  // as the corresponding invokes for shared objects.
+  // As a solution, for fixed-address binaries we set LPStart to 0, and for
+  // position-independent binaries we set LPStart to FDE start minus one byte
+  // for FDEs that start with a landing pad.
+  const bool NeedsLPAdjustment = !FF.empty() && FF.front()->isLandingPad();
   std::function<void(const MCSymbol *)> emitLandingPad;
   if (BC.HasFixedLoadAddress) {
     Streamer.emitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
     Streamer.emitIntValue(0, 4);                      // LPStart
     emitLandingPad = [&](const MCSymbol *LPSymbol) {
-      if (!LPSymbol)
-        Streamer.emitIntValue(0, 4);
-      else
+      if (LPSymbol)
         Streamer.emitSymbolValue(LPSymbol, 4);
+      else
+        Streamer.emitIntValue(0, 4);
     };
   } else {
-    Streamer.emitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
+    if (NeedsLPAdjustment) {
+      // Use relative LPStart format and emit LPStart as [SymbolStart - 1].
+      Streamer.emitIntValue(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4, 1);
+      MCSymbol *DotSymbol = BC.Ctx->createTempSymbol("LPBase");
+      Streamer.emitLabel(DotSymbol);
+
+      const MCExpr *LPStartExpr = MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(StartSymbol, *BC.Ctx),
+          MCSymbolRefExpr::create(DotSymbol, *BC.Ctx), *BC.Ctx);
+      LPStartExpr = MCBinaryExpr::createSub(
+          LPStartExpr, MCConstantExpr::create(1, *BC.Ctx), *BC.Ctx);
+      Streamer.emitValue(LPStartExpr, 4);
+    } else {
+      // DW_EH_PE_omit means FDE start (StartSymbol) will be used as LPStart.
+      Streamer.emitIntValue(dwarf::DW_EH_PE_omit, 1);
+    }
     emitLandingPad = [&](const MCSymbol *LPSymbol) {
-      if (!LPSymbol)
-        Streamer.emitIntValue(0, 4);
-      else
-        Streamer.emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
+      if (LPSymbol) {
+        const MCExpr *LPOffsetExpr = MCBinaryExpr::createSub(
+            MCSymbolRefExpr::create(LPSymbol, *BC.Ctx),
+            MCSymbolRefExpr::create(StartSymbol, *BC.Ctx), *BC.Ctx);
+        if (NeedsLPAdjustment)
+          LPOffsetExpr = MCBinaryExpr::createAdd(
+              LPOffsetExpr, MCConstantExpr::create(1, *BC.Ctx), *BC.Ctx);
+        Streamer.emitULEB128Value(LPOffsetExpr);
+      } else {
+        Streamer.emitULEB128IntValue(0);
+      }
     };
   }
 
@@ -972,10 +976,12 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) {
     Streamer.emitLabel(TTBaseRefLabel);
   }
 
-  // Emit the landing pad call site table. We use signed data4 since we can emit
-  // a landing pad in a different part of the split function that could appear
-  // earlier in the address space than LPStart.
-  Streamer.emitIntValue(dwarf::DW_EH_PE_sdata4, 1);
+  // Emit encoding of entries in the call site table. The format is used for the
+  // call site start, length, and corresponding landing pad.
+  if (BC.HasFixedLoadAddress)
+    Streamer.emitIntValue(dwarf::DW_EH_PE_sdata4, 1);
+  else
+    Streamer.emitIntValue(dwarf::DW_EH_PE_uleb128, 1);
 
   MCSymbol *CSTStartLabel = BC.Ctx->createTempSymbol("CSTStart");
   MCSymbol *CSTEndLabel = BC.Ctx->createTempSymbol("CSTEnd");
@@ -992,8 +998,13 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, const FunctionFragment &FF) {
 
     // Start of the range is emitted relative to the start of current
     // function split part.
-    Streamer.emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4);
-    Streamer.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+    if (BC.HasFixedLoadAddress) {
+      Streamer.emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4);
+      Streamer.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+    } else {
+      Streamer.emitAbsoluteSymbolDiffAsULEB128(BeginLabel, StartSymbol);
+      Streamer.emitAbsoluteSymbolDiffAsULEB128(EndLabel, BeginLabel);
+    }
     emitLandingPad(CallSite.LP);
     Streamer.emitULEB128IntValue(CallSite.Action);
   }
diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp
index 9ad49ca1b3a03..b16e0a4333aa2 100644
--- a/bolt/lib/Core/BinarySection.cpp
+++ b/bolt/lib/Core/BinarySection.cpp
@@ -142,6 +142,15 @@ void BinarySection::emitAsData(MCStreamer &Streamer,
     Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_end"));
 }
 
+uint64_t BinarySection::write(raw_ostream &OS) const {
+  const uint64_t NumValidContentBytes =
+      std::min<uint64_t>(getOutputContents().size(), getOutputSize());
+  OS.write(getOutputContents().data(), NumValidContentBytes);
+  if (getOutputSize() > NumValidContentBytes)
+    OS.write_zeros(getOutputSize() - NumValidContentBytes);
+  return getOutputSize();
+}
+
 void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS,
                                             SymbolResolverFuncTy Resolver) {
   if (PendingRelocations.empty() && Patches.empty())
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 1fcf2bb959bbb..7059a3dd23109 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -3887,6 +3887,43 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
 
 void RewriteInstance::mapAllocatableSections(
     BOLTLinker::SectionMapper MapSection) {
+
+  if (opts::UseOldText || opts::StrictMode) {
+    auto tryRewriteSection = [&](BinarySection &OldSection,
+                                 BinarySection &NewSection) {
+      if (OldSection.getSize() < NewSection.getOutputSize())
+        return;
+
+      BC->outs() << "BOLT-INFO: rewriting " << OldSection.getName()
+                 << " in-place\n";
+
+      NewSection.setOutputAddress(OldSection.getAddress());
+      NewSection.setOutputFileOffset(OldSection.getInputFileOffset());
+      MapSection(NewSection, OldSection.getAddress());
+
+      // Pad contents with zeros.
+      NewSection.addPadding(OldSection.getSize() - NewSection.getOutputSize());
+
+      // Prevent the original section name from appearing in the section header
+      // table.
+      OldSection.setAnonymous(true);
+    };
+
+    if (EHFrameSection) {
+      BinarySection *NewEHFrameSection =
+          getSection(getNewSecPrefix() + getEHFrameSectionName());
+      assert(NewEHFrameSection && "New contents expected for .eh_frame");
+      tryRewriteSection(*EHFrameSection, *NewEHFrameSection);
+    }
+    BinarySection *EHSection = getSection(".gcc_except_table");
+    BinarySection *NewEHSection =
+        getSection(getNewSecPrefix() + ".gcc_except_table");
+    if (EHSection) {
+      assert(NewEHSection && "New contents expected for .gcc_except_table");
+      tryRewriteSection(*EHSection, *NewEHSection);
+    }
+  }
+
   // Allocate read-only sections first, then writable sections.
   enum : uint8_t { ST_READONLY, ST_READWRITE };
   for (uint8_t SType = ST_READONLY; SType <= ST_READWRITE; ++SType) {
@@ -4164,7 +4201,6 @@ void RewriteInstance::rewriteNoteSections() {
     // New section size.
     uint64_t Size = 0;
     bool DataWritten = false;
-    uint8_t *SectionData = nullptr;
     // Copy over section contents unless it's one of the sections we overwrite.
     if (!willOverwriteSection(SectionName)) {
       Size = Section.sh_size;
@@ -4196,12 +4232,7 @@ void RewriteInstance::rewriteNoteSections() {
     if (BSec->getAllocAddress()) {
       assert(!DataWritten && "Writing section twice.");
       (void)DataWritten;
-      SectionData = BSec->getOutputData();
-
-      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " << (Size ? "appending" : "writing")
-                        << " contents to section " << SectionName << '\n');
-      OS.write(reinterpret_cast<char *>(SectionData), BSec->getOutputSize());
-      Size += BSec->getOutputSize();
+      Size += BSec->write(OS);
     }
 
     BSec->setOutputFileOffset(NextAvailableOffset);
@@ -4232,8 +4263,7 @@ void RewriteInstance::rewriteNoteSections() {
                << " of size " << Section.getOutputSize() << " at offset 0x"
                << Twine::utohexstr(Section.getOutputFileOffset()) << '\n');
 
-    OS.write(Section.getOutputContents().data(), Section.getOutputSize());
-    NextAvailableOffset += Section.getOutputSize();
+    NextAvailableOffset += Section.write(OS);
   }
 }
 
@@ -4347,6 +4377,10 @@ RewriteInstance::getOutputSections(ELFObjectFile<ELFT> *File,
     BinarySection *BinSec = BC->getSectionForSectionRef(SecRef);
     assert(BinSec && "Matching BinarySection should exist.");
 
+    // Exclude anonymous sections.
+    if (BinSec->isAnonymous())
+      continue;
+
     addSection(Section, *BinSec);
   }
 
@@ -5699,8 +5733,8 @@ void RewriteInstance::rewriteFile() {
                  << Twine::utohexstr(Section.getAllocAddress()) << "\n of size "
                  << Section.getOutputSize() << "\n at offset "
                  << Section.getOutputFileOffset() << '\n';
-    OS.pwrite(reinterpret_cast<const char *>(Section.getOutputData()),
-              Section.getOutputSize(), Section.getOutputFileOffset());
+    OS.seek(Section.getOutputFileOffset());
+    Section.write(OS);
   }
 
   for (BinarySection &Section : BC->allocatableSections())
@@ -5791,42 +5825,64 @@ void RewriteInstance::writeEHFrameHeader() {
   LLVM_DEBUG(dbgs() << "BOLT: writing a new " << getEHFrameHdrSectionName()
                     << '\n');
 
-  NextAvailableAddress =
-      appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign);
+  // Try to overwrite the original .eh_frame_hdr if the size permits.
+  uint64_t EHFrameHdrOutputAddress = 0;
+  uint64_t EHFrameHdrFileOffset = 0;
+  std::vector<char> NewEHFrameHdr;
+  BinarySection *OldEHFrameHdrSection = getSection(getEHFrameHdrSectionName());
+  if (OldEHFrameHdrSection) {
+    NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(
+        RelocatedEHFrame, NewEHFrame, OldEHFrameHdrSection->getAddress());
+    if (NewEHFrameHdr.size() <= OldEHFrameHdrSection->getSize()) {
+      BC->outs() << "BOLT-INFO: rewriting " << getEHFrameHdrSectionName()
+                 << " in-place\n";
+      EHFrameHdrOutputAddress = OldEHFrameHdrSection->getAddress();
+      EHFrameHdrFileOffset = OldEHFrameHdrSection->getInputFileOffset();
+    } else {
+      OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() +
+                                          getEHFrameHdrSectionName());
+      OldEHFrameHdrSection = nullptr;
+    }
+  }
 
-  const uint64_t EHFrameHdrOutputAddress = NextAvailableAddress;
-  const uint64_t EHFrameHdrFileOffset =
-      getFileOffsetForAddress(NextAvailableAddress);
+  // If there was not enough space, allocate more memory for .eh_frame_hdr.
+  if (!OldEHFrameHdrSection) {
+    NextAvailableAddress =
+        appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign);
 
-  std::vector<char> NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(
-      RelocatedEHFrame, NewEHFrame, EHFrameHdrOutputAddress);
+    EHFrameHdrOutputAddress = NextAvailableAddress;
+    EHFrameHdrFileOffset = getFileOffsetForAddress(NextAvailableAddress);
+
+    NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(
+        RelocatedEHFrame, NewEHFrame, EHFrameHdrOutputAddress);
+
+    NextAvailableAddress += NewEHFrameHdr.size();
+    if (!BC->BOLTReserved.empty() &&
+        (NextAvailableAddress > BC->BOLTReserved.end())) {
+      BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
+                 << " into reserved space\n";
+      exit(1);
+    }
+
+    // Create a new entry in the section header table.
+    const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
+                                                   /*IsText=*/false,
+                                                   /*IsAllocatable=*/true);
+    BinarySection &EHFrameHdrSec = BC->registerOrUpdateSection(
+        getNewSecPrefix() + getEHFrameHdrSectionName(), ELF::SHT_PROGBITS,
+        Flags, nullptr, NewEHFrameHdr.size(), /*Alignment=*/1);
+    EHFrameHdrSec.setOutputFileOffset(EHFrameHdrFileOffset);
+    EHFrameHdrSec.setOutputAddress(EHFrameHdrOutputAddress);
+    EHFrameHdrSec.setOutputName(getEHFrameHdrSectionName());
+  }
 
   Out->os().seek(EHFrameHdrFileOffset);
   Out->os().write(NewEHFrameHdr.data(), NewEHFrameHdr.size());
 
-  const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
-                                                 /*IsText=*/false,
-                                                 /*IsAllocatable=*/true);
-  BinarySection *OldEHFrameHdrSection = getSection(getEHFrameHdrSectionName());
+  // Pad the contents if overwriting in-place.
   if (OldEHFrameHdrSection)
-    OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() +
-                                        getEHFrameHdrSectionName());
-
-  BinarySection &EHFrameHdrSec = BC->registerOrUpdateSection(
-      getNewSecPrefix() + getEHFrameHdrSectionName(), ELF::SHT_PROGBITS, Flags,
-      nullptr, NewEHFrameHdr.size(), /*Alignment=*/1);
-  EHFrameHdrSec.setOutputFileOffset(EHFrameHdrFileOffset);
-  EHFrameHdrSec.setOutputAddress(EHFrameHdrOutputAddress);
-  EHFrameHdrSec.setOutputName(getEHFrameHdrSectionName());
-
-  NextAvailableAddress += EHFrameHdrSec.getOutputSize();
-
-  if (!BC->BOLTReserved.empty() &&
-      (NextAvailableAddress > BC->BOLTReserved.end())) {
-    BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
-               << " into reserved space\n";
-    exit(1);
-  }
+    Out->os().write_zeros(OldEHFrameHdrSection->getSize() -
+                          NewEHFrameHdr.size());
 
   // Merge new .eh_frame with the relocated original so that gdb can locate all
   // FDEs.
diff --git a/bolt/test/eh-frame-hdr.test b/bolt/test/eh-frame-hdr.test
new file mode 100644
index 0000000000000..4d718c850e2f2
--- /dev/null
+++ b/bolt/test/eh-frame-hdr.test
@@ -0,0 +1,12 @@
+# Check that llvm-bolt overwrites .eh_frame_hdr in-place.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --use-old-text \
+RUN:   | FileCheck %s --check-prefix=CHECK-BOLT
+RUN: llvm-readelf -WS %t.bolt | FileCheck %s
+
+CHECK-BOLT: rewriting .eh_frame_hdr in-place
+
+CHECK-NOT: .bolt.org.eh_frame_hdr
diff --git a/bolt/test/eh-frame-overwrite.test b/bolt/test/eh-frame-overwrite.test
new file mode 100644
index 0000000000000..649d739ec6086
--- /dev/null
+++ b/bolt/test/eh-frame-overwrite.test
@@ -0,0 +1,8 @@
+# Check that llvm-bolt can overwrite .eh_frame section in-place.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --strict | FileCheck %s
+
+CHECK: rewriting .eh_frame in-place
diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
index 620a57194acb8..3d1f63fcf33a5 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
@@ -20,7 +20,7 @@ namespace {
 
 AST_MATCHER_P(FunctionDecl, isEnabled, llvm::StringSet<>,
               FunctionsThatShouldNotThrow) {
-  return FunctionsThatShouldNotThrow.count(Node.getNameAsString()) > 0;
+  return FunctionsThatShouldNotThrow.contains(Node.getNameAsString());
 }
 
 AST_MATCHER(FunctionDecl, isExplicitThrow) {
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
index 9bfb7e2677533..68f3ecf6bdaa8 100644
--- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
@@ -418,7 +418,7 @@ ExceptionAnalyzer::ExceptionInfo::filterIgnoredExceptions(
       if (TD->getDeclName().isIdentifier()) {
         if ((IgnoreBadAlloc &&
              (TD->getName() == "bad_alloc" && TD->isInStdNamespace())) ||
-            (IgnoredTypes.count(TD->getName()) > 0))
+            (IgnoredTypes.contains(TD->getName())))
           TypesToDelete.push_back(T);
       }
     }
@@ -449,7 +449,8 @@ void ExceptionAnalyzer::ExceptionInfo::reevaluateBehaviour() {
 ExceptionAnalyzer::ExceptionInfo ExceptionAnalyzer::throwsException(
     const FunctionDecl *Func, const ExceptionInfo::Throwables &Caught,
     llvm::SmallSet<const FunctionDecl *, 32> &CallStack) {
-  if (!Func || CallStack.count(Func) || (!CallStack.empty() && !canThrow(Func)))
+  if (!Func || CallStack.contains(Func) ||
+      (!CallStack.empty() && !canThrow(Func)))
     return ExceptionInfo::createNonThrowing();
 
   if (const Stmt *Body = Func->getBody()) {
@@ -507,7 +508,7 @@ ExceptionAnalyzer::ExceptionInfo ExceptionAnalyzer::throwsException(
     for (unsigned I = 0; I < Try->getNumHandlers(); ++I) {
       const CXXCatchStmt *Catch = Try->getHandler(I);
 
-      // Everything is catched through 'catch(...)'.
+      // Everything is caught through 'catch(...)'.
       if (!Catch->getExceptionDecl()) {
         ExceptionInfo Rethrown = throwsException(
             Catch->getHandlerBlock(), Uncaught.getExceptionTypes(), CallStack);
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h
index 0a8cf8668d3ca..6c2d693d64b50 100644
--- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h
+++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h
@@ -101,8 +101,8 @@ class ExceptionAnalyzer {
     /// Recalculate the 'Behaviour' for example after filtering.
     void reevaluateBehaviour();
 
-    /// Keep track if the entity related to this 'ExceptionInfo' can in princple
-    /// throw, if it's unknown or if it won't throw.
+    /// Keep track if the entity related to this 'ExceptionInfo' can in
+    /// principle throw, if it's unknown or if it won't throw.
     State Behaviour;
 
     /// Keep track if the entity contains any unknown elements to keep track
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index 761f96846d453..05c8041df7de7 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -511,6 +511,35 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
         if (auto EditsNearCursor = Completion->getBoolean("editsNearCursor"))
           R.CompletionFixes |= *EditsNearCursor;
       }
+      if (auto *References = TextDocument->getObject("references")) {
+        if (auto ContainerSupport = References->getBoolean("container")) {
+          R.ReferenceContainer |= *ContainerSupport;
+        }
+      }
+      if (auto *Diagnostics = TextDocument->getObject("publishDiagnostics")) {
+        if (auto CodeActions = Diagnostics->getBoolean("codeActionsInline")) {
+          R.DiagnosticFixes |= *CodeActions;
+        }
+      }
+      if (auto *InactiveRegions =
+              TextDocument->getObject("inactiveRegionsCapabilities")) {
+        if (auto InactiveRegionsSupport =
+                InactiveRegions->getBoolean("inactiveRegions")) {
+          R.InactiveRegions |= *InactiveRegionsSupport;
+        }
+      }
+    }
+    if (auto *Window = Experimental->getObject("window")) {
+      if (auto Implicit =
+              Window->getBoolean("implicitWorkDoneProgressCreate")) {
+        R.ImplicitProgressCreation |= *Implicit;
+      }
+    }
+    if (auto *OffsetEncoding = Experimental->get("offsetEncoding")) {
+      R.offsetEncoding.emplace();
+      if (!fromJSON(*OffsetEncoding, *R.offsetEncoding,
+                    P.field("offsetEncoding")))
+        return false;
     }
   }
 
diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h
index 5b28095758198..c7ef1a13e6e39 100644
--- a/clang-tools-extra/clangd/Protocol.h
+++ b/clang-tools-extra/clangd/Protocol.h
@@ -452,6 +452,7 @@ struct ClientCapabilities {
   std::optional<SymbolKindBitset> WorkspaceSymbolKinds;
 
   /// Whether the client accepts diagnostics with codeActions attached inline.
+  /// This is a clangd extension.
   /// textDocument.publishDiagnostics.codeActionsInline.
   bool DiagnosticFixes = false;
 
@@ -475,6 +476,7 @@ struct ClientCapabilities {
 
   /// Client supports displaying a container string for results of
   /// textDocument/reference (clangd extension)
+  /// textDocument.references.container
   bool ReferenceContainer = false;
 
   /// Client supports hierarchical document symbols.
@@ -563,6 +565,7 @@ struct ClientCapabilities {
 
   /// Whether the client supports the textDocument/inactiveRegions
   /// notification. This is a clangd extension.
+  /// textDocument.inactiveRegionsCapabilities.inactiveRegions
   bool InactiveRegions = false;
 };
 bool fromJSON(const llvm::json::Value &, ClientCapabilities &,
diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 4fd11307857ff..61fa66180376c 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -63,6 +63,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -2275,7 +2276,7 @@ incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) {
   // Initially store the ranges in a map keyed by SymbolID of the caller.
   // This allows us to group different calls with the same caller
   // into the same CallHierarchyIncomingCall.
-  llvm::DenseMap<SymbolID, std::vector<Range>> CallsIn;
+  llvm::DenseMap<SymbolID, std::vector<Location>> CallsIn;
   // We can populate the ranges based on a refs request only. As we do so, we
   // also accumulate the container IDs into a lookup request.
   LookupRequest ContainerLookup;
@@ -2285,7 +2286,7 @@ incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) {
       elog("incomingCalls failed to convert location: {0}", Loc.takeError());
       return;
     }
-    CallsIn[R.Container].push_back(Loc->range);
+    CallsIn[R.Container].push_back(*Loc);
 
     ContainerLookup.IDs.insert(R.Container);
   });
@@ -2294,9 +2295,21 @@ incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) {
   Index->lookup(ContainerLookup, [&](const Symbol &Caller) {
     auto It = CallsIn.find(Caller.ID);
     assert(It != CallsIn.end());
-    if (auto CHI = symbolToCallHierarchyItem(Caller, Item.uri.file()))
+    if (auto CHI = symbolToCallHierarchyItem(Caller, Item.uri.file())) {
+      std::vector<Range> FromRanges;
+      for (const Location &L : It->second) {
+        if (L.uri != CHI->uri) {
+          // Call location not in same file as caller.
+          // This can happen in some edge cases. There's not much we can do,
+          // since the protocol only allows returning ranges interpreted as
+          // being in the caller's file.
+          continue;
+        }
+        FromRanges.push_back(L.range);
+      }
       Results.push_back(
-          CallHierarchyIncomingCall{std::move(*CHI), std::move(It->second)});
+          CallHierarchyIncomingCall{std::move(*CHI), std::move(FromRanges)});
+    }
   });
   // Sort results by name of container.
   llvm::sort(Results, [](const CallHierarchyIncomingCall &A,
diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
index 591a8b245260e..789c10bdd4822 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
+++ b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
@@ -109,14 +109,13 @@ findContextForNS(llvm::StringRef TargetNS, const DeclContext *CurContext) {
 // afterwards it can be shared with define-inline code action.
 llvm::Expected<std::string>
 getFunctionSourceAfterReplacements(const FunctionDecl *FD,
-                                   const tooling::Replacements &Replacements) {
+                                   const tooling::Replacements &Replacements,
+                                   bool TargetFileIsHeader) {
   const auto &SM = FD->getASTContext().getSourceManager();
   auto OrigFuncRange = toHalfOpenFileRange(
       SM, FD->getASTContext().getLangOpts(), FD->getSourceRange());
   if (!OrigFuncRange)
     return error("Couldn't get range for function.");
-  assert(!FD->getDescribedFunctionTemplate() &&
-         "Define out-of-line doesn't apply to function templates.");
 
   // Get new begin and end positions for the qualified function definition.
   unsigned FuncBegin = SM.getFileOffset(OrigFuncRange->getBegin());
@@ -129,24 +128,38 @@ getFunctionSourceAfterReplacements(const FunctionDecl *FD,
   if (!QualifiedFunc)
     return QualifiedFunc.takeError();
 
+  auto Source = QualifiedFunc->substr(FuncBegin, FuncEnd - FuncBegin + 1);
   std::string TemplatePrefix;
+  auto AddToTemplatePrefixIfApplicable = [&](const Decl *D) {
+    const TemplateParameterList *Params = D->getDescribedTemplateParams();
+    if (!Params)
+      return;
+    for (Decl *P : *Params) {
+      if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(P))
+        TTP->removeDefaultArgument();
+      else if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(P))
+        NTTP->removeDefaultArgument();
+      else if (auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(P))
+        TTPD->removeDefaultArgument();
+    }
+    std::string S;
+    llvm::raw_string_ostream Stream(S);
+    Params->print(Stream, FD->getASTContext());
+    if (!S.empty())
+      *S.rbegin() = '\n'; // Replace space with newline
+    TemplatePrefix.insert(0, S);
+  };
+  AddToTemplatePrefixIfApplicable(FD);
   if (auto *MD = llvm::dyn_cast<CXXMethodDecl>(FD)) {
     for (const CXXRecordDecl *Parent = MD->getParent(); Parent;
          Parent =
              llvm::dyn_cast_or_null<const CXXRecordDecl>(Parent->getParent())) {
-      if (const TemplateParameterList *Params =
-              Parent->getDescribedTemplateParams()) {
-        std::string S;
-        llvm::raw_string_ostream Stream(S);
-        Params->print(Stream, FD->getASTContext());
-        if (!S.empty())
-          *S.rbegin() = '\n'; // Replace space with newline
-        TemplatePrefix.insert(0, S);
-      }
+      AddToTemplatePrefixIfApplicable(Parent);
     }
   }
 
-  auto Source = QualifiedFunc->substr(FuncBegin, FuncEnd - FuncBegin + 1);
+  if (TargetFileIsHeader)
+    Source.insert(0, "inline ");
   if (!TemplatePrefix.empty())
     Source.insert(0, TemplatePrefix);
   return Source;
@@ -202,7 +215,8 @@ deleteTokensWithKind(const syntax::TokenBuffer &TokBuf, tok::TokenKind Kind,
 llvm::Expected<std::string>
 getFunctionSourceCode(const FunctionDecl *FD, const DeclContext *TargetContext,
                       const syntax::TokenBuffer &TokBuf,
-                      const HeuristicResolver *Resolver) {
+                      const HeuristicResolver *Resolver,
+                      bool TargetFileIsHeader) {
   auto &AST = FD->getASTContext();
   auto &SM = AST.getSourceManager();
 
@@ -225,6 +239,8 @@ getFunctionSourceCode(const FunctionDecl *FD, const DeclContext *TargetContext,
           return;
 
         for (const NamedDecl *ND : Ref.Targets) {
+          if (ND->getKind() == Decl::TemplateTypeParm)
+            return;
           if (ND->getDeclContext() != Ref.Targets.front()->getDeclContext()) {
             elog("Targets from multiple contexts: {0}, {1}",
                  printQualifiedName(*Ref.Targets.front()),
@@ -337,7 +353,8 @@ getFunctionSourceCode(const FunctionDecl *FD, const DeclContext *TargetContext,
 
   if (Errors)
     return std::move(Errors);
-  return getFunctionSourceAfterReplacements(FD, DeclarationCleanups);
+  return getFunctionSourceAfterReplacements(FD, DeclarationCleanups,
+                                            TargetFileIsHeader);
 }
 
 struct InsertionPoint {
@@ -419,15 +436,15 @@ class DefineOutline : public Tweak {
         Source->isOutOfLine())
       return false;
 
-    // Bail out if this is a function template or specialization, as their
+    // Bail out if this is a function template specialization, as their
     // definitions need to be visible in all including translation units.
-    if (Source->getDescribedFunctionTemplate())
-      return false;
     if (Source->getTemplateSpecializationInfo())
       return false;
 
     auto *MD = llvm::dyn_cast<CXXMethodDecl>(Source);
     if (!MD) {
+      if (Source->getDescribedFunctionTemplate())
+        return false;
       // Can't outline free-standing functions in the same file.
       return !SameFile;
     }
@@ -450,6 +467,19 @@ class DefineOutline : public Tweak {
       }
     }
 
+    // For function templates, the same limitations as for class templates
+    // apply.
+    if (const TemplateParameterList *Params =
+            MD->getDescribedTemplateParams()) {
+      // FIXME: Is this really needed? It inhibits application on
+      //        e.g. std::enable_if.
+      for (NamedDecl *P : *Params) {
+        if (!P->getIdentifier())
+          return false;
+      }
+      SameFile = true;
+    }
+
     // The refactoring is meaningless for unnamed classes and namespaces,
     // unless we're outlining in the same file
     for (const DeclContext *DC = MD->getParent(); DC; DC = DC->getParent()) {
@@ -485,7 +515,8 @@ class DefineOutline : public Tweak {
 
     auto FuncDef = getFunctionSourceCode(
         Source, InsertionPoint->EnclosingNamespace, Sel.AST->getTokens(),
-        Sel.AST->getHeuristicResolver());
+        Sel.AST->getHeuristicResolver(),
+        SameFile && isHeaderFile(Sel.AST->tuPath(), Sel.AST->getLangOpts()));
     if (!FuncDef)
       return FuncDef.takeError();
 
diff --git a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp
index b2278ff12735d..8821d3aad9c78 100644
--- a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp
@@ -491,6 +491,35 @@ TEST(CallHierarchy, HierarchyOnVar) {
                                 fromRanges(Source.range("Callee")))));
 }
 
+TEST(CallHierarchy, CallInDifferentFileThanCaller) {
+  Annotations Header(R"cpp(
+    #define WALDO void caller() {
+  )cpp");
+  Annotations Source(R"cpp(
+    void call^ee();
+    WALDO
+      callee();
+    }
+  )cpp");
+  auto TU = TestTU::withCode(Source.code());
+  TU.HeaderCode = Header.code();
+  auto AST = TU.build();
+  auto Index = TU.index();
+
+  std::vector<CallHierarchyItem> Items =
+      prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename));
+  ASSERT_THAT(Items, ElementsAre(withName("callee")));
+
+  auto Incoming = incomingCalls(Items[0], Index.get());
+
+  // The only call site is in the source file, which is a different file from
+  // the declaration of the function containing the call, which is in the
+  // header. The protocol does not allow us to represent such calls, so we drop
+  // them. (The call hierarchy item itself is kept.)
+  EXPECT_THAT(Incoming,
+              ElementsAre(AllOf(from(withName("caller")), fromRanges())));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/tweaks/DefineOutlineTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/DefineOutlineTests.cpp
index 6a9e90c3bfa70..d2d2ae9e7bb61 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/DefineOutlineTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/DefineOutlineTests.cpp
@@ -111,11 +111,17 @@ TEST_F(DefineOutlineTest, TriggersOnFunctionDecl) {
     template <typename> struct Foo { void fo^o(){} };
     )cpp");
 
-  // Not available on function templates and specializations, as definition must
-  // be visible to all translation units.
+  // Not available on function template specializations and free function
+  // templates.
   EXPECT_UNAVAILABLE(R"cpp(
-    template <typename> void fo^o() {};
-    template <> void fo^o<int>() {};
+    template <typename T> void fo^o() {}
+    template <> void fo^o<int>() {}
+  )cpp");
+
+  // Not available on member function templates with unnamed template
+  // parameters.
+  EXPECT_UNAVAILABLE(R"cpp(
+    struct Foo { template <typename> void ba^r() {} };
   )cpp");
 
   // Not available on methods of unnamed classes.
@@ -237,7 +243,7 @@ TEST_F(DefineOutlineTest, ApplyTest) {
                 Foo(T z) __attribute__((weak)) ;
                 int bar;
               };template <typename T>
-Foo<T>::Foo(T z) __attribute__((weak)) : bar(2){}
+inline Foo<T>::Foo(T z) __attribute__((weak)) : bar(2){}
 )cpp",
           ""},
       // Virt specifiers.
@@ -390,7 +396,7 @@ Foo<T>::Foo(T z) __attribute__((weak)) : bar(2){}
               };
             };template <typename T, typename ...U>
 template <class V, int A>
-typename O1<T, U...>::template O2<V, A>::E O1<T, U...>::template O2<V, A>::I::foo(T, U..., V, E) { return E1; }
+inline typename O1<T, U...>::template O2<V, A>::E O1<T, U...>::template O2<V, A>::I::foo(T, U..., V, E) { return E1; }
 )cpp",
           ""},
       // Destructors
@@ -399,6 +405,37 @@ typename O1<T, U...>::template O2<V, A>::E O1<T, U...>::template O2<V, A>::I::fo
           "class A { ~A(); };",
           "A::~A(){} ",
       },
+
+      // Member template
+      {
+          R"cpp(
+            struct Foo {
+              template <typename T, bool B = true>
+              T ^bar() { return {}; }
+            };)cpp",
+          R"cpp(
+            struct Foo {
+              template <typename T, bool B = true>
+              T bar() ;
+            };template <typename T, bool B>
+inline T Foo::bar() { return {}; }
+)cpp",
+          ""},
+
+      // Class template with member template
+      {
+          R"cpp(
+            template <typename T> struct Foo {
+              template <typename U> T ^bar(const T& t, const U& u) { return {}; }
+            };)cpp",
+          R"cpp(
+            template <typename T> struct Foo {
+              template <typename U> T bar(const T& t, const U& u) ;
+            };template <typename T>
+template <typename U>
+inline T Foo<T>::bar(const T& t, const U& u) { return {}; }
+)cpp",
+          ""},
   };
   for (const auto &Case : Cases) {
     SCOPED_TRACE(Case.Test);
diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt
index d72a117166a08..7e4d99d8cfc1d 100644
--- a/clang-tools-extra/test/CMakeLists.txt
+++ b/clang-tools-extra/test/CMakeLists.txt
@@ -50,8 +50,6 @@ set(CLANG_TOOLS_TEST_DEPS
   clang-resource-headers
 
   clang-tidy
-  # Clang-tidy tests need clang for building modules.
-  clang
 )
 
 # Add lit test dependencies.
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index 87118bbd33377..853217c6db61a 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -119,7 +119,6 @@ if (NOT DEFINED CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
 endif()
 
-set(CMAKE_CROSSCOMPILING                    ON CACHE BOOL "")
 set(CMAKE_CL_SHOWINCLUDES_PREFIX            "Note: including file: " CACHE STRING "")
 # Required if COMPILER_RT_DEFAULT_TARGET_ONLY is ON
 set(CMAKE_C_COMPILER_TARGET                 "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "")
@@ -219,6 +218,11 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI
 set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS      ON CACHE BOOL "")
 # Merge libc++ and libc++abi libraries into the single libc++ library file.
 set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_STATIC_ABI_LIBRARY          ON CACHE BOOL "")
+# Forcibly disable the libc++ benchmarks on Windows build hosts
+# (current benchmark test configuration does not support the cross builds there).
+if (WIN32)
+  set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_INCLUDE_BENCHMARKS               OFF CACHE BOOL "")
+endif(WIN32)
 
 # Avoid searching for the python3 interpreter during the runtimes configuration for the cross builds.
 # It starts searching the python3 package using the target's sysroot path, that usually is not compatible with the build host.
diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index f189cb4e6a2ac..39d389b816f12 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -160,6 +160,10 @@ wording a diagnostic.
   named in a diagnostic message. e.g., prefer wording like ``'this' pointer
   cannot be null in well-defined C++ code`` over wording like ``this pointer
   cannot be null in well-defined C++ code``.
+* Prefer diagnostic wording without contractions whenever possible. The single
+  quote in a contraction can be visually distracting due to its use with
+  syntactic constructs, and contractions can be harder to understand for
+  non-native English speakers.
 
 The Format String
 ^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2bd67138ecc04..999c88455b64a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -356,6 +356,8 @@ Non-comprehensive list of changes in this release
   issues with the sanitizer because the counter is automatically set.
 
 - ``__builtin_reduce_add`` function can now be used in constant expressions.
+- ``__builtin_reduce_mul`` function can now be used in constant expressions.
+- ``__builtin_reduce_and`` function can now be used in constant expressions.
 
 New Compiler Flags
 ------------------
@@ -537,6 +539,28 @@ Improvements to Clang's diagnostics
 
 - Improved diagnostic message for ``__builtin_bit_cast`` size mismatch (#GH115870).
 
+- Clang now omits shadow warnings for enum constants in separate class scopes (#GH62588).
+
+- When diagnosing an unused return value of a type declared ``[[nodiscard]]``, the type
+  itself is now included in the diagnostic.
+
+- Clang now prefers a ``[[nodiscard]]`` attribute on a function declaration over a ``[[nodiscard]]``
+  attribute on the function's return type. Previously, when both had a ``[[nodiscard]]`` attribute attached,
+  the one on the return type was preferred. This may affect the generated warning message:
+
+  .. code-block:: c++
+
+    struct [[nodiscard("Reason 1")]] S {};
+    [[nodiscard("Reason 2")]] S getS();
+    void use()
+    {
+      getS(); // Now diagnoses "Reason 2", previously diagnosed "Reason 1"
+    }
+
+- Clang now diagnoses ``= delete("reason")`` extension warnings only in pedantic mode rather than on by default (#GH109311).
+
+- Clang now diagnoses missing return value in functions containing ``if consteval`` (#GH116485).
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -666,6 +690,8 @@ Bug Fixes to C++ Support
 - Name independent data members were not correctly initialized from default member initializers. (#GH114069)
 - Fixed expression transformation for ``[[assume(...)]]``, allowing using pack indexing expressions within the
   assumption if they also occur inside of a dependent lambda. (#GH114787)
+- Clang now uses valid deduced type locations when diagnosing functions with a trailing return type
+  that are missing the placeholder return type. (#GH78694)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -712,6 +738,8 @@ Target Specific Changes
 AMDGPU Support
 ^^^^^^^^^^^^^^
 
+- Initial support for gfx950
+
 - Added headers ``gpuintrin.h`` and ``amdgpuintrin.h`` that contains common
   definitions for GPU builtin functions. This header can be included for OpenMP,
   CUDA, HIP, OpenCL, and C/C++.
diff --git a/clang/docs/ThreadSafetyAnalysis.rst b/clang/docs/ThreadSafetyAnalysis.rst
index cc4089b97b492..f6517afc3bfc2 100644
--- a/clang/docs/ThreadSafetyAnalysis.rst
+++ b/clang/docs/ThreadSafetyAnalysis.rst
@@ -933,11 +933,25 @@ implementation.
     MutexLocker(Mutex *mu, defer_lock_t) EXCLUDES(mu) : mut(mu), locked(false) {}
 
     // Same as constructors, but without tag types. (Requires C++17 copy elision.)
-    static MutexLocker Lock(Mutex *mu) ACQUIRE(mu);
-    static MutexLocker Adopt(Mutex *mu) REQUIRES(mu);
-    static MutexLocker ReaderLock(Mutex *mu) ACQUIRE_SHARED(mu);
-    static MutexLocker AdoptReaderLock(Mutex *mu) REQUIRES_SHARED(mu);
-    static MutexLocker DeferLock(Mutex *mu) EXCLUDES(mu);
+    static MutexLocker Lock(Mutex *mu) ACQUIRE(mu) {
+      return MutexLocker(mu);
+    }
+
+    static MutexLocker Adopt(Mutex *mu) REQUIRES(mu) {
+      return MutexLocker(mu, adopt_lock);
+    }
+
+    static MutexLocker ReaderLock(Mutex *mu) ACQUIRE_SHARED(mu) {
+      return MutexLocker(mu, shared_lock);
+    }
+
+    static MutexLocker AdoptReaderLock(Mutex *mu) REQUIRES_SHARED(mu) {
+      return MutexLocker(mu, adopt_lock, shared_lock);
+    }
+
+    static MutexLocker DeferLock(Mutex *mu) EXCLUDES(mu) {
+      return MutexLocker(mu, defer_lock);
+    }
 
     // Release *this and all associated mutexes, if they are still held.
     // There is no warning if the scope was already unlocked before.
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 466c65a9685ad..708c8656decbe 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -3181,12 +3181,14 @@ class CallExpr : public Expr {
   QualType getCallReturnType(const ASTContext &Ctx) const;
 
   /// Returns the WarnUnusedResultAttr that is either declared on the called
-  /// function, or its return type declaration.
-  const Attr *getUnusedResultAttr(const ASTContext &Ctx) const;
+  /// function, or its return type declaration, together with a NamedDecl that
+  /// refers to the declaration the attribute is attached to.
+  std::pair<const NamedDecl *, const Attr *>
+  getUnusedResultAttr(const ASTContext &Ctx) const;
 
   /// Returns true if this call expression should warn on unused results.
   bool hasUnusedResultAttr(const ASTContext &Ctx) const {
-    return getUnusedResultAttr(Ctx) != nullptr;
+    return getUnusedResultAttr(Ctx).second != nullptr;
   }
 
   SourceLocation getRParenLoc() const { return RParenLoc; }
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 6035a563d5fce..634253d003256 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4888,3 +4888,10 @@ def ClspvLibclcBuiltin: InheritableAttr {
   let Documentation = [ClspvLibclcBuiltinDoc];
   let SimpleHandler = 1;
 }
+
+def NoTrivialAutoVarInit: InheritableAttr {
+  let Spellings = [Declspec<"no_init_all">];
+  let Subjects = SubjectList<[Function, Tag]>;
+  let Documentation = [NoTrivialAutoVarInitDocs];
+  let SimpleHandler = 1;
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2fdceca163ee6..6fb2eb3eb3e66 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3921,17 +3921,42 @@ have their lifetimes extended.
 def LifetimeCaptureByDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
-    Similar to `lifetimebound`_, the ``lifetime_capture_by(X)`` attribute on a function
-parameter or implicit object parameter indicates that that objects that are referred to
-by that parameter may also be referred to by the capturing entity ``X``.
+Similar to `lifetimebound`_, the ``lifetime_capture_by(X)`` attribute on a
+function parameter or implicit object parameter indicates that the capturing
+entity ``X`` may refer to the object referred to by that parameter.
+
+Below is a list of parameter types and what they are considered to refer to:
+
+- A reference param (of non-view type) is considered to refer to its referenced object.
+- A pointer param (of non-view type) is considered to refer to its pointee.
+- A view type param (type annotated with ``[[gsl::Pointer()]]``) is considered to refer
+  to its pointee (gsl owner). This holds true even if the view type appears as a reference
+  in the parameter. For example, both ``std::string_view`` and
+  ``const std::string_view &`` are considered to refer to a ``std::string``.
+- A ``std::initializer_list<T>`` is considered to refer to its underlying array.
+- Aggregates (arrays and simple ``struct``\s) are considered to refer to all
+  objects that their transitive subobjects refer to.
+
+Clang diagnoses the use of a temporary object as an argument to such an
+annotated parameter.
+In this case, the capturing entity ``X`` could capture a dangling reference to this
+temporary object.
 
-By default, a reference is considered to refer to its referenced object, a
-pointer is considered to refer to its pointee, a ``std::initializer_list<T>``
-is considered to refer to its underlying array, and aggregates (arrays and
-simple ``struct``\s) are considered to refer to all objects that their
-transitive subobjects refer to.
+.. code-block:: c++
+  
+  void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set<std::string_view>& s) {
+    s.insert(a);
+  }
+  void use() {
+    std::set<std::string_view> s;
+    addToSet(std::string(), s); // Warning: object whose reference is captured by 's' will be destroyed at the end of the full-expression.
+    //       ^^^^^^^^^^^^^
+    std::string local;
+    addToSet(local, s); // Ok.
+  }
 
 The capturing entity ``X`` can be one of the following:
+
 - Another (named) function parameter. 
   
   .. code-block:: c++
@@ -3951,7 +3976,7 @@ The capturing entity ``X`` can be one of the following:
       std::set<std::string_view> s;
     };
 
-- 'global', 'unknown' (without quotes).
+- ``global``, ``unknown``.
   
   .. code-block:: c++
 
@@ -3983,6 +4008,22 @@ The attribute supports specifying more than one capturing entities:
     s2.insert(a);
   }
 
+Limitation: The capturing entity ``X`` is not used by the analysis and is
+used for documentation purposes only. This is because the analysis is
+statement-local and only detects use of a temporary as an argument to the
+annotated parameter.
+
+.. code-block:: c++
+  
+  void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set<std::string_view>& s);
+  void use() {
+    std::set<std::string_view> s;
+    if (foo()) {
+      std::string str;
+      addToSet(str, s); // Not detected.
+    }
+  }
+
 .. _`lifetimebound`: https://clang.llvm.org/docs/AttributeReference.html#lifetimebound
   }];
 }
@@ -8719,6 +8760,18 @@ Attribute used by `clspv`_ (OpenCL-C to Vulkan SPIR-V compiler) to identify func
 }];
 }
 
+def NoTrivialAutoVarInitDocs : Documentation {
+  let Category = DocCatDecl;
+  let Content = [{
+The ``__declspec(no_init_all)`` attribute disables the automatic initialization that the
+`-ftrivial-auto-var-init`_ flag would have applied to locals in a marked function, or instances of
+a marked type. Note that this attribute has no effect for locals that are automatically initialized
+without the `-ftrivial-auto-var-init`_ flag.
+
+.. _`-ftrivial-auto-var-init`: ClangCommandLineReference.html#cmdoption-clang-ftrivial-auto-var-init
+}];
+}
+
 def DocCatNonBlockingNonAllocating : DocumentationCategory<"Performance Constraint Attributes"> {
   let Content = [{
 The ``nonblocking``, ``blocking``, ``nonallocating`` and ``allocating`` attributes can be attached
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index f5124f4633364..aa65f94e68f9c 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1498,7 +1498,7 @@ def ReduceOr : Builtin {
 
 def ReduceAnd : Builtin {
   let Spellings = ["__builtin_reduce_and"];
-  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
   let Prototype = "void(...)";
 }
 
@@ -1510,7 +1510,7 @@ def ReduceAdd : Builtin {
 
 def ReduceMul : Builtin {
   let Spellings = ["__builtin_reduce_mul"];
-  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
   let Prototype = "void(...)";
 }
 
@@ -1995,6 +1995,12 @@ def AtomicThreadFence : Builtin {
   let Prototype = "void(int)";
 }
 
+def ScopedAtomicThreadFence : Builtin {
+  let Spellings = ["__scoped_atomic_thread_fence"];
+  let Attributes = [NoThrow];
+  let Prototype = "void(int, int)";
+}
+
 def AtomicSignalFence : Builtin {
   let Spellings = ["__atomic_signal_fence"];
   let Attributes = [NoThrow];
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8f44afa405938..7ce8f2c1669d6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -431,6 +431,14 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-conversion-
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-conversion-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-insts")
 
+//===----------------------------------------------------------------------===//
+// GFX950 only builtins.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
+
 //===----------------------------------------------------------------------===//
 // GFX12+ only builtins.
 //===----------------------------------------------------------------------===//
@@ -522,5 +530,7 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs",
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
 
+TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
index f644b820a6189..c4ea46a3bc5b5 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
@@ -371,7 +371,7 @@ TARGET_BUILTIN(__builtin_lasx_xvor_v, "V32UcV32UcV32Uc", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvxor_v, "V32UcV32UcV32Uc", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvnor_v, "V32UcV32UcV32Uc", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvandn_v, "V32UcV32UcV32Uc", "nc", "lasx")
-TARGET_BUILTIN(__builtin_lasx_xvorn_v, "V32ScV32ScV32Sc", "nc", "lasx")
+TARGET_BUILTIN(__builtin_lasx_xvorn_v, "V32UcV32UcV32Uc", "nc", "lasx")
 
 TARGET_BUILTIN(__builtin_lasx_xvandi_b, "V32UcV32UcIUi", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvori_b, "V32UcV32UcIUi", "nc", "lasx")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
index b3056971986d1..a823783af88c4 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
@@ -355,7 +355,7 @@ TARGET_BUILTIN(__builtin_lsx_vor_v, "V16UcV16UcV16Uc", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vxor_v, "V16UcV16UcV16Uc", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vnor_v, "V16UcV16UcV16Uc", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vandn_v, "V16UcV16UcV16Uc", "nc", "lsx")
-TARGET_BUILTIN(__builtin_lsx_vorn_v, "V16ScV16ScV16Sc", "nc", "lsx")
+TARGET_BUILTIN(__builtin_lsx_vorn_v, "V16UcV16UcV16Uc", "nc", "lsx")
 
 TARGET_BUILTIN(__builtin_lsx_vandi_b, "V16UcV16UcIUi", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vori_b, "V16UcV16UcIUi", "nc", "lsx")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 721e8981af6ff..c2a4addf488df 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -107,6 +107,7 @@ enum class OffloadArch {
   GFX940,
   GFX941,
   GFX942,
+  GFX950,
   GFX10_1_GENERIC,
   GFX1010,
   GFX1011,
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 0c131166aff28..f4a155bb00bb3 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -364,9 +364,9 @@ def err_target_unsupported_abi_with_fpu : Error<
 
 def err_ppc_impossible_musttail: Error<
   "'musttail' attribute for this call is impossible because %select{"
-  "long calls can not be tail called on PPC|"
-  "indirect calls can not be tail called on PPC|"
-  "external calls can not be tail called on PPC}0"
+  "long calls cannot be tail called on PPC|"
+  "indirect calls cannot be tail called on PPC|"
+  "external calls cannot be tail called on PPC}0"
   >;
 def err_aix_musttail_unsupported: Error<
   "'musttail' attribute is not supported on AIX">;
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 76fdbdbfb01d9..5155b23d151c0 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -553,7 +553,7 @@ def err_test_module_file_extension_format : Error<
   "'blockname:major:minor:hashed:user info'">;
 
 def err_drv_module_output_with_multiple_arch : Error<
-  "option '-fmodule-output' can't be used with multiple arch options">;
+  "option '-fmodule-output' cannot be used with multiple arch options">;
 
 def warn_drv_delayed_template_parsing_after_cxx20 : Warning<
   "-fdelayed-template-parsing is deprecated after C++20">,
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 72eada50a56cc..df9bf94b5d039 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -453,6 +453,7 @@ def ShiftOpParentheses: DiagGroup<"shift-op-parentheses">;
 def OverloadedShiftOpParentheses: DiagGroup<"overloaded-shift-op-parentheses">;
 def DanglingAssignment: DiagGroup<"dangling-assignment">;
 def DanglingAssignmentGsl : DiagGroup<"dangling-assignment-gsl">;
+def DanglingCapture : DiagGroup<"dangling-capture">;
 def DanglingElse: DiagGroup<"dangling-else">;
 def DanglingField : DiagGroup<"dangling-field">;
 def DanglingInitializerList : DiagGroup<"dangling-initializer-list">;
@@ -462,6 +463,7 @@ def ReturnStackAddress : DiagGroup<"return-stack-address">;
 def : DiagGroup<"return-local-addr", [ReturnStackAddress]>;
 def Dangling : DiagGroup<"dangling", [DanglingAssignment,
                                       DanglingAssignmentGsl,
+                                      DanglingCapture,
                                       DanglingField,
                                       DanglingInitializerList,
                                       DanglingGsl,
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 0da509280068a..77bf08453dea5 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -976,7 +976,7 @@ def warn_cxx98_compat_defaulted_deleted_function : Warning<
   "%select{defaulted|deleted}0 function definitions are incompatible with C++98">,
   InGroup<CXX98Compat>, DefaultIgnore;
 
-def ext_delete_with_message : ExtWarn<
+def ext_delete_with_message : Extension<
   "'= delete' with a message is a C++2c extension">, InGroup<CXX26>;
 def warn_cxx23_delete_with_message : Warning<
   "'= delete' with a message is incompatible with C++ standards before C++2c">,
diff --git a/clang/include/clang/Basic/DiagnosticRefactoringKinds.td b/clang/include/clang/Basic/DiagnosticRefactoringKinds.td
index 5446b32efbdd4..e060fffc7280a 100644
--- a/clang/include/clang/Basic/DiagnosticRefactoringKinds.td
+++ b/clang/include/clang/Basic/DiagnosticRefactoringKinds.td
@@ -14,7 +14,7 @@ let Component = "Refactoring" in {
 
 let CategoryName = "Refactoring Invocation Issue" in {
 
-def err_refactor_no_selection : Error<"refactoring action can't be initiated "
+def err_refactor_no_selection : Error<"refactoring action cannot be initiated "
   "without a selection">;
 def err_refactor_selection_no_symbol : Error<"there is no symbol at the given "
   "location">;
@@ -26,7 +26,7 @@ def err_refactor_code_outside_of_function : Error<"the selected code is not a "
 def err_refactor_extract_simple_expression : Error<"the selected expression "
   "is too simple to extract">;
 def err_refactor_extract_prohibited_expression : Error<"the selected "
-  "expression can't be extracted">;
+  "expression cannot be extracted">;
 
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 17eb28e8fc562..157d77b38b354 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1151,7 +1151,7 @@ def err_pragma_attribute_matcher_subrule_contradicts_rule : Error<
 def err_pragma_attribute_matcher_negated_subrule_contradicts_subrule : Error<
   "negated attribute subject matcher sub-rule '%0' contradicts sub-rule '%1'">;
 def err_pragma_attribute_invalid_matchers : Error<
-  "attribute %0 can't be applied to %1">;
+  "attribute %0 cannot be applied to %1">;
 def err_pragma_attribute_stack_mismatch : Error<
   "'#pragma clang attribute %select{%1.|}0pop' with no matching"
   " '#pragma clang attribute %select{%1.|}0push'">;
@@ -6150,7 +6150,7 @@ def err_mismatched_owning_module : Error<
   "declaration of %0 in %select{the global module|module %2}1 follows "
   "declaration in %select{the global module|module %4}3">;
 def err_multiple_decl_in_different_modules : Error<
-  "declaration %0 attached to named module '%1' can't be attached to "
+  "declaration %0 attached to named module '%1' cannot be attached to "
   "other modules">;
 def err_redefinition_different_type : Error<
   "redefinition of %0 with a different type%diff{: $ vs $|}1,2">;
@@ -8560,7 +8560,7 @@ def err_typecheck_missing_return_type_incompatible : Error<
   "literal|lambda expression}2 has unspecified explicit return type">;
 
 def note_incomplete_class_and_qualified_id : Note<
-  "conformance of forward class %0 to protocol %1 can not be confirmed">;
+  "conformance of forward class %0 to protocol %1 cannot be confirmed">;
 def warn_incompatible_qualified_id : Warning<
   "%select{%diff{assigning to $ from incompatible type $|"
   "assigning to type from incompatible type}0,1"
@@ -9300,11 +9300,11 @@ def warn_unused_container_subscript_expr : Warning<
 def warn_unused_call : Warning<
   "ignoring return value of function declared with %0 attribute">,
   InGroup<UnusedValue>;
-def warn_unused_constructor : Warning<
-  "ignoring temporary created by a constructor declared with %0 attribute">,
+def warn_unused_return_type : Warning<
+  "ignoring %select{return value|temporary}0 of type %2 declared with %1 attribute%select{|: %4}3">,
   InGroup<UnusedValue>;
-def warn_unused_constructor_msg : Warning<
-  "ignoring temporary created by a constructor declared with %0 attribute: %1">,
+def warn_unused_constructor : Warning<
+  "ignoring temporary created by a constructor declared with %0 attribute%select{|: %2}1">,
   InGroup<UnusedValue>;
 def warn_side_effects_unevaluated_context : Warning<
   "expression with side effects has no effect in an unevaluated context">,
@@ -9313,10 +9313,7 @@ def warn_side_effects_typeid : Warning<
   "expression with side effects will be evaluated despite being used as an "
   "operand to 'typeid'">, InGroup<PotentiallyEvaluatedExpression>;
 def warn_unused_result : Warning<
-  "ignoring return value of function declared with %0 attribute">,
-  InGroup<UnusedResult>;
-def warn_unused_result_msg : Warning<
-  "ignoring return value of function declared with %0 attribute: %1">,
+  "ignoring return value of function declared with %0 attribute%select{|: %2}1">,
   InGroup<UnusedResult>;
 def warn_unused_result_typedef_unsupported_spelling : Warning<
   "'[[%select{nodiscard|gnu::warn_unused_result}0]]' attribute ignored when "
@@ -9414,7 +9411,7 @@ let CategoryName = "Inline Assembly Issue" in {
     "asm constraint has an unexpected number of alternatives: %0 vs %1">;
   def err_asm_incomplete_type : Error<"asm operand has incomplete type %0">;
   def err_asm_unknown_register_name : Error<"unknown register name '%0' in asm">;
-  def err_asm_unwind_and_goto : Error<"unwind clobber can't be used with asm goto">;
+  def err_asm_unwind_and_goto : Error<"unwind clobber cannot be used with asm goto">;
   def err_asm_invalid_global_var_reg : Error<"register '%0' unsuitable for "
     "global register variables on this target">;
   def err_asm_register_size_mismatch : Error<"size of register '%0' does not "
@@ -9433,7 +9430,7 @@ let CategoryName = "Inline Assembly Issue" in {
   def err_asm_input_duplicate_match : Error<
     "more than one input constraint matches the same output '%0'">;
   def err_store_value_to_reg : Error<
-    "impossible constraint in asm: can't store value into a register">;
+    "impossible constraint in asm: cannot store value into a register">;
 
   def warn_asm_label_on_auto_decl : Warning<
     "ignored asm label '%0' on automatic variable">;
@@ -10132,10 +10129,11 @@ def err_lifetimebound_ctor_dtor : Error<
   "%select{constructor|destructor}0">;
 def err_lifetimebound_parameter_void_return_type : Error<
   "'lifetimebound' attribute cannot be applied to a parameter of a function "
-  "that returns void">;
+  "that returns void; did you mean 'lifetime_capture_by(X)'">;
 def err_lifetimebound_implicit_object_parameter_void_return_type : Error<
   "'lifetimebound' attribute cannot be applied to an implicit object "
-  "parameter of a function that returns void">;
+  "parameter of a function that returns void; "
+  "did you mean 'lifetime_capture_by(X)'">;
 
 // CHECK: returning address/reference of stack memory
 def warn_ret_stack_addr_ref : Warning<
@@ -10230,6 +10228,12 @@ def warn_dangling_pointer_assignment : Warning<
    "object backing %select{|the pointer }0%1 "
    "will be destroyed at the end of the full-expression">,
    InGroup<DanglingAssignment>;
+def warn_dangling_reference_captured : Warning<
+   "object whose reference is captured by '%0' will be destroyed at the end of "
+   "the full-expression">, InGroup<DanglingCapture>, DefaultIgnore;
+def warn_dangling_reference_captured_by_unknown : Warning<
+   "object whose reference is captured will be destroyed at the end of "
+   "the full-expression">, InGroup<DanglingCapture>, DefaultIgnore;
 
 // For non-floating point, expressions of the form x == x or x != x
 // should result in a warning, since these always evaluate to a constant.
@@ -10960,7 +10964,7 @@ def err_opencl_builtin_pipe_invalid_access_modifier : Error<
 def err_opencl_invalid_access_qualifier : Error<
   "access qualifier can only be used for pipe and image type">;
 def err_opencl_invalid_read_write : Error<
-  "access qualifier %0 can not be used for %1 %select{|prior to OpenCL C version 2.0 or in version 3.0 "
+  "access qualifier %0 cannot be used for %1 %select{|prior to OpenCL C version 2.0 or in version 3.0 "
   "and without __opencl_c_read_write_images feature}2">;
 def err_opencl_multiple_access_qualifiers : Error<
   "multiple access qualifiers">;
@@ -11460,7 +11464,7 @@ def err_omp_wrong_linear_modifier : Error<
 def err_omp_wrong_linear_modifier_non_reference : Error<
   "variable of non-reference type %0 can be used only with 'val' modifier, but used with '%1'">;
 def err_omp_step_simple_modifier_exclusive : Error<
-  "step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier">;
+  "step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier">;
 def err_omp_wrong_simdlen_safelen_values : Error<
   "the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter">;
 def err_omp_wrong_if_directive_name_modifier : Error<
@@ -11534,7 +11538,7 @@ def err_omp_schedule_nonmonotonic_static : Error<
 def err_omp_simple_clause_incompatible_with_ordered : Error<
   "'%0' clause with '%1' modifier cannot be specified if an 'ordered' clause is specified">;
 def err_omp_ordered_simd : Error<
-  "'ordered' clause with a parameter can not be specified in '#pragma omp %0' directive">;
+  "'ordered' clause with a parameter cannot be specified in '#pragma omp %0' directive">;
 def err_omp_variable_in_given_clause_and_dsa : Error<
   "%0 variable cannot be in a %1 clause in '#pragma omp %2' directive">;
 def err_omp_param_or_this_in_clause : Error<
@@ -12366,7 +12370,7 @@ def err_preserve_enum_value_not_const: Error<
   "__builtin_preserve_enum_value argument %0 not a constant">;
 
 def err_bit_cast_non_trivially_copyable : Error<
-  "__builtin_bit_cast %select{source|destination}0 type must be trivially copyable">;
+  "'__builtin_bit_cast' %select{source|destination}0 type must be trivially copyable">;
 def err_bit_cast_type_size_mismatch : Error<
   "size of '__builtin_bit_cast' source type %0 does not match destination type %1 (%2 vs %3 bytes)">;
 
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 7f5d26118bdc7..9088c867d53ce 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -116,6 +116,7 @@ FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFuncti
 FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
 FEATURE(ptrauth_init_fini_address_discrimination, LangOpts.PointerAuthInitFiniAddressDiscrimination)
+FEATURE(ptrauth_elf_got, LangOpts.PointerAuthELFGOT)
 EXTENSION(swiftcc,
   PP.getTargetInfo().checkCallingConvention(CC_Swift) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 019ca25bb57ff..9cd23d123f2ba 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1871,6 +1871,11 @@ class TargetInfo : public TransferrableTargetInfo,
   void CheckFixedPointBits() const;
 };
 
+namespace targets {
+std::unique_ptr<clang::TargetInfo>
+AllocateTarget(const llvm::Triple &Triple, const clang::TargetOptions &Opts);
+} // namespace targets
+
 }  // end namespace clang
 
 #endif
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 1debb94a0a7b8..93abbc47c54dd 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -753,8 +753,8 @@ let params = T.Float in {
   defm: compare<"ne", fcmp_ne>;
   defm: compare<"gt", fcmp_gt>;
   defm: compare<"ge", fcmp_ge>;
-  defm: compare<"lt", fcmp_lt>;
-  defm: compare<"le", fcmp_le>;
+  defm: compare<"lt", fcmp_ult>;
+  defm: compare<"le", fcmp_ule>;
 }
 
 let params = T.Signed in {
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 1a090c08cc853..634b4e9c2c9c8 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -116,8 +116,8 @@ def fcmp_eq: IRBuilder<"CreateFCmpOEQ">;
 def fcmp_ne: IRBuilder<"CreateFCmpUNE">; // not O: it must return true on NaNs
 def fcmp_gt: IRBuilder<"CreateFCmpOGT">;
 def fcmp_ge: IRBuilder<"CreateFCmpOGE">;
-def fcmp_lt: IRBuilder<"CreateFCmpOLT">;
-def fcmp_le: IRBuilder<"CreateFCmpOLE">;
+def fcmp_ult: IRBuilder<"CreateFCmpULT">;
+def fcmp_ule: IRBuilder<"CreateFCmpULE">;
 def splat: CGHelperFn<"ARMMVEVectorSplat">;
 def select: IRBuilder<"CreateSelect">;
 def fneg: IRBuilder<"CreateFNeg">;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d7230dd7272fd..5167c3c39e315 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1732,8 +1732,6 @@ defm gnu_inline_asm : BoolFOption<"gnu-inline-asm",
           "Disable GNU style inline asm">,
   PosFlag<SetTrue>>;
 
-def fprofile_sample_use : Flag<["-"], "fprofile-sample-use">, Group<f_Group>,
-    Visibility<[ClangOption, CLOption]>;
 def fno_profile_sample_use : Flag<["-"], "fno-profile-sample-use">, Group<f_Group>,
     Visibility<[ClangOption, CLOption]>;
 def fprofile_sample_use_EQ : Joined<["-"], "fprofile-sample-use=">,
@@ -1759,8 +1757,6 @@ def fsample_profile_use_profi : Flag<["-"], "fsample-profile-use-profi">,
                basic block counts to branch probabilites to fix them by extended
                and re-engineered classic MCMF (min-cost max-flow) approach.}]>;
 def fno_profile_sample_accurate : Flag<["-"], "fno-profile-sample-accurate">, Group<f_Group>;
-def fauto_profile : Flag<["-"], "fauto-profile">, Group<f_Group>,
-    Alias<fprofile_sample_use>;
 def fno_auto_profile : Flag<["-"], "fno-auto-profile">, Group<f_Group>,
     Alias<fno_profile_sample_use>;
 def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
@@ -4356,6 +4352,7 @@ defm ptrauth_indirect_gotos : OptInCC1FFlag<"ptrauth-indirect-gotos",
 defm ptrauth_init_fini : OptInCC1FFlag<"ptrauth-init-fini", "Enable signing of function pointers in init/fini arrays">;
 defm ptrauth_init_fini_address_discrimination : OptInCC1FFlag<"ptrauth-init-fini-address-discrimination",
   "Enable address discrimination of function pointers in init/fini arrays">;
+defm ptrauth_elf_got : OptInCC1FFlag<"ptrauth-elf-got", "Enable authentication of pointers from GOT (ELF only)">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
diff --git a/clang/include/clang/Frontend/FrontendAction.h b/clang/include/clang/Frontend/FrontendAction.h
index 039f6f247b6d8..718684a67771a 100644
--- a/clang/include/clang/Frontend/FrontendAction.h
+++ b/clang/include/clang/Frontend/FrontendAction.h
@@ -21,6 +21,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Frontend/ASTUnit.h"
+#include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendOptions.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
@@ -185,7 +186,12 @@ class FrontendAction {
   virtual bool usesPreprocessorOnly() const = 0;
 
   /// For AST-based actions, the kind of translation unit we're handling.
-  virtual TranslationUnitKind getTranslationUnitKind() { return TU_Complete; }
+  virtual TranslationUnitKind getTranslationUnitKind() {
+    // The ASTContext, if it exists, knows the exact TUKind of the frontend.
+    if (Instance && Instance->hasASTContext())
+      return Instance->getASTContext().TUKind;
+    return TU_Complete;
+  }
 
   /// Does this action support use with PCH?
   virtual bool hasPCHSupport() const { return true; }
diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h
index 1230a3a7016fa..b1b63aedf86ab 100644
--- a/clang/include/clang/Interpreter/Interpreter.h
+++ b/clang/include/clang/Interpreter/Interpreter.h
@@ -177,7 +177,8 @@ class Interpreter {
 
   CodeGenerator *getCodeGen() const;
   std::unique_ptr<llvm::Module> GenModule();
-  PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU);
+  PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU,
+                                      std::unique_ptr<llvm::Module> M = {});
 
   // A cache for the compiled destructors used to for de-allocation of managed
   // clang::Values.
diff --git a/clang/include/clang/Interpreter/PartialTranslationUnit.h b/clang/include/clang/Interpreter/PartialTranslationUnit.h
index bf91d559452b8..c878e139fe70d 100644
--- a/clang/include/clang/Interpreter/PartialTranslationUnit.h
+++ b/clang/include/clang/Interpreter/PartialTranslationUnit.h
@@ -31,6 +31,9 @@ struct PartialTranslationUnit {
 
   /// The llvm IR produced for the input.
   std::unique_ptr<llvm::Module> TheModule;
+  bool operator==(const PartialTranslationUnit &other) const {
+    return other.TUPart == TUPart && other.TheModule == TheModule;
+  }
 };
 } // namespace clang
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d6f3508a5243f..6ea6c67447b6f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2323,6 +2323,9 @@ class Sema final : public SemaBase {
   bool BuiltinVectorMath(CallExpr *TheCall, QualType &Res, bool FPOnly = false);
   bool BuiltinVectorToScalarMath(CallExpr *TheCall);
 
+  void checkLifetimeCaptureBy(FunctionDecl *FDecl, bool IsMemberFunction,
+                              const Expr *ThisArg, ArrayRef<const Expr *> Args);
+
   /// Handles the checks for format strings, non-POD arguments to vararg
   /// functions, NULL arguments passed to non-NULL parameters, diagnose_if
   /// attributes and AArch64 SME attributes.
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 8725d5455ec73..fd834c14ce790 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -44,7 +44,7 @@ namespace serialization {
 /// Version 4 of AST files also requires that the version control branch and
 /// revision match exactly, since there is no backward compatibility of
 /// AST files at this time.
-const unsigned VERSION_MAJOR = 33;
+const unsigned VERSION_MAJOR = 34;
 
 /// AST file minor version number supported by this version of
 /// Clang.
@@ -350,9 +350,8 @@ enum ControlRecordTypes {
   /// and information about the compiler used to build this AST file.
   METADATA = 1,
 
-  /// Record code for the list of other AST files imported by
-  /// this AST file.
-  IMPORTS,
+  /// Record code for another AST file imported by this AST file.
+  IMPORT,
 
   /// Record code for the original file that was used to
   /// generate the AST file, including both its file ID and its
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 9c274adc59a20..f739fe688c110 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2389,11 +2389,8 @@ class ASTReader
 
   // Read a string
   static std::string ReadString(const RecordDataImpl &Record, unsigned &Idx);
-
-  // Skip a string
-  static void SkipString(const RecordData &Record, unsigned &Idx) {
-    Idx += Record[Idx] + 1;
-  }
+  static StringRef ReadStringBlob(const RecordDataImpl &Record, unsigned &Idx,
+                                  StringRef &Blob);
 
   // Read a path
   std::string ReadPath(ModuleFile &F, const RecordData &Record, unsigned &Idx);
@@ -2401,11 +2398,8 @@ class ASTReader
   // Read a path
   std::string ReadPath(StringRef BaseDirectory, const RecordData &Record,
                        unsigned &Idx);
-
-  // Skip a path
-  static void SkipPath(const RecordData &Record, unsigned &Idx) {
-    SkipString(Record, Idx);
-  }
+  std::string ReadPathBlob(StringRef BaseDirectory, const RecordData &Record,
+                           unsigned &Idx, StringRef &Blob);
 
   /// Read a version tuple.
   static VersionTuple ReadVersionTuple(const RecordData &Record, unsigned &Idx);
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 161b2ef7c86a4..e418fdea44a0a 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -769,6 +769,8 @@ class ASTWriter : public ASTDeserializationListener,
 
   /// Add a string to the given record.
   void AddString(StringRef Str, RecordDataImpl &Record);
+  void AddStringBlob(StringRef Str, RecordDataImpl &Record,
+                     SmallVectorImpl<char> &Blob);
 
   /// Convert a path from this build process into one that is appropriate
   /// for emission in the module file.
@@ -776,6 +778,8 @@ class ASTWriter : public ASTDeserializationListener,
 
   /// Add a path to the given record.
   void AddPath(StringRef Path, RecordDataImpl &Record);
+  void AddPathBlob(StringRef Str, RecordDataImpl &Record,
+                   SmallVectorImpl<char> &Blob);
 
   /// Emit the current record with the given path as a blob.
   void EmitRecordWithPath(unsigned Abbrev, RecordDataRef Record,
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 8aab8d2d2ce93..a4fb4d5a1f2ec 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1615,22 +1615,24 @@ QualType CallExpr::getCallReturnType(const ASTContext &Ctx) const {
   return FnType->getReturnType();
 }
 
-const Attr *CallExpr::getUnusedResultAttr(const ASTContext &Ctx) const {
+std::pair<const NamedDecl *, const Attr *>
+CallExpr::getUnusedResultAttr(const ASTContext &Ctx) const {
+  // If the callee (when known) is marked nodiscard, return that attribute.
+  if (const Decl *D = getCalleeDecl())
+    if (const auto *A = D->getAttr<WarnUnusedResultAttr>())
+      return {nullptr, A};
+
   // If the return type is a struct, union, or enum that is marked nodiscard,
   // then return the return type attribute.
   if (const TagDecl *TD = getCallReturnType(Ctx)->getAsTagDecl())
     if (const auto *A = TD->getAttr<WarnUnusedResultAttr>())
-      return A;
+      return {TD, A};
 
   for (const auto *TD = getCallReturnType(Ctx)->getAs<TypedefType>(); TD;
        TD = TD->desugar()->getAs<TypedefType>())
     if (const auto *A = TD->getDecl()->getAttr<WarnUnusedResultAttr>())
-      return A;
-
-  // Otherwise, see if the callee is marked nodiscard and return that attribute
-  // instead.
-  const Decl *D = getCalleeDecl();
-  return D ? D->getAttr<WarnUnusedResultAttr>() : nullptr;
+      return {TD->getDecl(), A};
+  return {nullptr, nullptr};
 }
 
 SourceLocation CallExpr::getBeginLoc() const {
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 833b99bf1bd9f..33206f5cda202 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -13527,7 +13527,9 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     return Success(DidOverflow, E);
   }
 
-  case Builtin::BI__builtin_reduce_add: {
+  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_mul:
+  case Builtin::BI__builtin_reduce_and: {
     APValue Source;
     if (!EvaluateAsRValue(Info, E->getArg(0), Source))
       return false;
@@ -13535,10 +13537,28 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     unsigned SourceLen = Source.getVectorLength();
     APSInt Reduced = Source.getVectorElt(0).getInt();
     for (unsigned EltNum = 1; EltNum < SourceLen; ++EltNum) {
-      if (!CheckedIntArithmetic(
-              Info, E, Reduced, Source.getVectorElt(EltNum).getInt(),
-              Reduced.getBitWidth() + 1, std::plus<APSInt>(), Reduced))
+      switch (BuiltinOp) {
+      default:
         return false;
+      case Builtin::BI__builtin_reduce_add: {
+        if (!CheckedIntArithmetic(
+                Info, E, Reduced, Source.getVectorElt(EltNum).getInt(),
+                Reduced.getBitWidth() + 1, std::plus<APSInt>(), Reduced))
+          return false;
+        break;
+      }
+      case Builtin::BI__builtin_reduce_mul: {
+        if (!CheckedIntArithmetic(
+                Info, E, Reduced, Source.getVectorElt(EltNum).getInt(),
+                Reduced.getBitWidth() * 2, std::multiplies<APSInt>(), Reduced))
+          return false;
+        break;
+      }
+      case Builtin::BI__builtin_reduce_and: {
+        Reduced &= Source.getVectorElt(EltNum).getInt();
+        break;
+      }
+      }
     }
 
     return Success(Reduced, E);
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index f678ac6f2ff36..7a6bd8b6f8d07 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -3177,11 +3177,14 @@ CFGBlock *CFGBuilder::VisitIfStmt(IfStmt *I) {
     if (!I->isConsteval())
       KnownVal = tryEvaluateBool(I->getCond());
 
-    // Add the successors.  If we know that specific branches are
+    // Add the successors. If we know that specific branches are
     // unreachable, inform addSuccessor() of that knowledge.
     addSuccessor(Block, ThenBlock, /* IsReachable = */ !KnownVal.isFalse());
     addSuccessor(Block, ElseBlock, /* IsReachable = */ !KnownVal.isTrue());
 
+    if (I->isConsteval())
+      return Block;
+
     // Add the condition as the last statement in the new block.  This may
     // create new blocks as the condition may contain control-flow.  Any newly
     // created blocks will be pointed to be "Block".
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 59c932468cd89..d56609a2a8f24 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -125,6 +125,7 @@ static const OffloadArchToStringMap arch_names[] = {
     GFX(940),  // gfx940
     GFX(941),  // gfx941
     GFX(942),  // gfx942
+    GFX(950),  // gfx950
     {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"},
     GFX(1010), // gfx1010
     GFX(1011), // gfx1011
diff --git a/clang/lib/Basic/TargetDefines.h b/clang/lib/Basic/TargetDefines.h
new file mode 100644
index 0000000000000..96fc4fe70fa9d
--- /dev/null
+++ b/clang/lib/Basic/TargetDefines.h
@@ -0,0 +1,39 @@
+//===------- TargetDefines.h - Target define helpers ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a series of helper functions for defining target-specific
+// macros.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_BASIC_TARGETDEFINES_H
+#define LLVM_CLANG_LIB_BASIC_TARGETDEFINES_H
+
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+namespace targets {
+/// Define a macro name and standard variants.  For example if MacroName is
+/// "unix", then this will define "__unix", "__unix__", and "unix" when in GNU
+/// mode.
+LLVM_LIBRARY_VISIBILITY
+void DefineStd(clang::MacroBuilder &Builder, llvm::StringRef MacroName,
+               const clang::LangOptions &Opts);
+
+LLVM_LIBRARY_VISIBILITY
+void defineCPUMacros(clang::MacroBuilder &Builder, llvm::StringRef CPUName,
+                     bool Tuning = true);
+
+LLVM_LIBRARY_VISIBILITY
+void addCygMingDefines(const clang::LangOptions &Opts,
+                       clang::MacroBuilder &Builder);
+} // namespace targets
+} // namespace clang
+#endif // LLVM_CLANG_LIB_BASIC_TARGETDEFINES_H
diff --git a/clang/lib/Basic/Targets.h b/clang/lib/Basic/Targets.h
index b4d2486b5d2b1..e1458384fa1c8 100644
--- a/clang/lib/Basic/Targets.h
+++ b/clang/lib/Basic/Targets.h
@@ -15,32 +15,7 @@
 #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_H
 #define LLVM_CLANG_LIB_BASIC_TARGETS_H
 
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/MacroBuilder.h"
+#include "TargetDefines.h"
 #include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/StringRef.h"
 
-namespace clang {
-namespace targets {
-
-LLVM_LIBRARY_VISIBILITY
-std::unique_ptr<clang::TargetInfo>
-AllocateTarget(const llvm::Triple &Triple, const clang::TargetOptions &Opts);
-
-/// DefineStd - Define a macro name and standard variants.  For example if
-/// MacroName is "unix", then this will define "__unix", "__unix__", and "unix"
-/// when in GNU mode.
-LLVM_LIBRARY_VISIBILITY
-void DefineStd(clang::MacroBuilder &Builder, llvm::StringRef MacroName,
-               const clang::LangOptions &Opts);
-
-LLVM_LIBRARY_VISIBILITY
-void defineCPUMacros(clang::MacroBuilder &Builder, llvm::StringRef CPUName,
-                     bool Tuning = true);
-
-LLVM_LIBRARY_VISIBILITY
-void addCygMingDefines(const clang::LangOptions &Opts,
-                       clang::MacroBuilder &Builder);
-} // namespace targets
-} // namespace clang
 #endif // LLVM_CLANG_LIB_BASIC_TARGETS_H
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 0897032c4b854..dbc3fec365761 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -209,6 +209,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::GFX940:
       case OffloadArch::GFX941:
       case OffloadArch::GFX942:
+      case OffloadArch::GFX950:
       case OffloadArch::GFX10_1_GENERIC:
       case OffloadArch::GFX1010:
       case OffloadArch::GFX1011:
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index eaaba7642bd7b..c61ee7ee20392 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -108,6 +108,10 @@ bool RISCVTargetInfo::validateAsmConstraint(
       return true;
     }
     return false;
+  case 'R':
+    // An even-odd GPR pair
+    Info.setAllowsRegister();
+    return true;
   case 'v':
     // A vector register.
     if (Name[1] == 'r' || Name[1] == 'd' || Name[1] == 'm') {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index df69d188306be..0916e14f182dd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5213,6 +5213,136 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     Builder.SetInsertPoint(ContBB);
     return RValue::get(nullptr);
   }
+  case Builtin::BI__scoped_atomic_thread_fence: {
+    auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic);
+
+    Value *Order = EmitScalarExpr(E->getArg(0));
+    Value *Scope = EmitScalarExpr(E->getArg(1));
+    auto Ord = dyn_cast<llvm::ConstantInt>(Order);
+    auto Scp = dyn_cast<llvm::ConstantInt>(Scope);
+    if (Ord && Scp) {
+      SyncScope SS = ScopeModel->isValid(Scp->getZExtValue())
+                         ? ScopeModel->map(Scp->getZExtValue())
+                         : ScopeModel->map(ScopeModel->getFallBackValue());
+      switch (Ord->getZExtValue()) {
+      case 0:  // memory_order_relaxed
+      default: // invalid order
+        break;
+      case 1: // memory_order_consume
+      case 2: // memory_order_acquire
+        Builder.CreateFence(
+            llvm::AtomicOrdering::Acquire,
+            getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS,
+                                                llvm::AtomicOrdering::Acquire,
+                                                getLLVMContext()));
+        break;
+      case 3: // memory_order_release
+        Builder.CreateFence(
+            llvm::AtomicOrdering::Release,
+            getTargetHooks().getLLVMSyncScopeID(getLangOpts(), SS,
+                                                llvm::AtomicOrdering::Release,
+                                                getLLVMContext()));
+        break;
+      case 4: // memory_order_acq_rel
+        Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease,
+                            getTargetHooks().getLLVMSyncScopeID(
+                                getLangOpts(), SS,
+                                llvm::AtomicOrdering::AcquireRelease,
+                                getLLVMContext()));
+        break;
+      case 5: // memory_order_seq_cst
+        Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
+                            getTargetHooks().getLLVMSyncScopeID(
+                                getLangOpts(), SS,
+                                llvm::AtomicOrdering::SequentiallyConsistent,
+                                getLLVMContext()));
+        break;
+      }
+      return RValue::get(nullptr);
+    }
+
+    llvm::BasicBlock *ContBB = createBasicBlock("atomic.scope.continue", CurFn);
+
+    llvm::SmallVector<std::pair<llvm::BasicBlock *, llvm::AtomicOrdering>>
+        OrderBBs;
+    if (Ord) {
+      switch (Ord->getZExtValue()) {
+      case 0:  // memory_order_relaxed
+      default: // invalid order
+        ContBB->eraseFromParent();
+        return RValue::get(nullptr);
+      case 1: // memory_order_consume
+      case 2: // memory_order_acquire
+        OrderBBs.emplace_back(Builder.GetInsertBlock(),
+                              llvm::AtomicOrdering::Acquire);
+        break;
+      case 3: // memory_order_release
+        OrderBBs.emplace_back(Builder.GetInsertBlock(),
+                              llvm::AtomicOrdering::Release);
+        break;
+      case 4: // memory_order_acq_rel
+        OrderBBs.emplace_back(Builder.GetInsertBlock(),
+                              llvm::AtomicOrdering::AcquireRelease);
+        break;
+      case 5: // memory_order_seq_cst
+        OrderBBs.emplace_back(Builder.GetInsertBlock(),
+                              llvm::AtomicOrdering::SequentiallyConsistent);
+        break;
+      }
+    } else {
+      llvm::BasicBlock *AcquireBB = createBasicBlock("acquire", CurFn);
+      llvm::BasicBlock *ReleaseBB = createBasicBlock("release", CurFn);
+      llvm::BasicBlock *AcqRelBB = createBasicBlock("acqrel", CurFn);
+      llvm::BasicBlock *SeqCstBB = createBasicBlock("seqcst", CurFn);
+
+      Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
+      llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
+      SI->addCase(Builder.getInt32(1), AcquireBB);
+      SI->addCase(Builder.getInt32(2), AcquireBB);
+      SI->addCase(Builder.getInt32(3), ReleaseBB);
+      SI->addCase(Builder.getInt32(4), AcqRelBB);
+      SI->addCase(Builder.getInt32(5), SeqCstBB);
+
+      OrderBBs.emplace_back(AcquireBB, llvm::AtomicOrdering::Acquire);
+      OrderBBs.emplace_back(ReleaseBB, llvm::AtomicOrdering::Release);
+      OrderBBs.emplace_back(AcqRelBB, llvm::AtomicOrdering::AcquireRelease);
+      OrderBBs.emplace_back(SeqCstBB,
+                            llvm::AtomicOrdering::SequentiallyConsistent);
+    }
+
+    for (auto &[OrderBB, Ordering] : OrderBBs) {
+      Builder.SetInsertPoint(OrderBB);
+      if (Scp) {
+        SyncScope SS = ScopeModel->isValid(Scp->getZExtValue())
+                           ? ScopeModel->map(Scp->getZExtValue())
+                           : ScopeModel->map(ScopeModel->getFallBackValue());
+        Builder.CreateFence(Ordering,
+                            getTargetHooks().getLLVMSyncScopeID(
+                                getLangOpts(), SS, Ordering, getLLVMContext()));
+        Builder.CreateBr(ContBB);
+      } else {
+        llvm::DenseMap<unsigned, llvm::BasicBlock *> BBs;
+        for (unsigned Scp : ScopeModel->getRuntimeValues())
+          BBs[Scp] = createBasicBlock(getAsString(ScopeModel->map(Scp)), CurFn);
+
+        auto *SC = Builder.CreateIntCast(Scope, Builder.getInt32Ty(), false);
+        llvm::SwitchInst *SI = Builder.CreateSwitch(SC, ContBB);
+        for (unsigned Scp : ScopeModel->getRuntimeValues()) {
+          auto *B = BBs[Scp];
+          SI->addCase(Builder.getInt32(Scp), B);
+
+          Builder.SetInsertPoint(B);
+          Builder.CreateFence(Ordering, getTargetHooks().getLLVMSyncScopeID(
+                                            getLangOpts(), ScopeModel->map(Scp),
+                                            Ordering, getLLVMContext()));
+          Builder.CreateBr(ContBB);
+        }
+      }
+    }
+
+    Builder.SetInsertPoint(ContBB);
+    return RValue::get(nullptr);
+  }
 
   case Builtin::BI__builtin_signbit:
   case Builtin::BI__builtin_signbitf:
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 6e9d28cea28e7..47b21bc9f63f0 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1899,13 +1899,16 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
   const Address Loc =
       locIsByrefHeader ? emission.getObjectAddress(*this) : emission.Addr;
 
+  auto hasNoTrivialAutoVarInitAttr = [&](const Decl *D) {
+    return D && D->hasAttr<NoTrivialAutoVarInitAttr>();
+  };
   // Note: constexpr already initializes everything correctly.
   LangOptions::TrivialAutoVarInitKind trivialAutoVarInit =
-      (D.isConstexpr()
+      ((D.isConstexpr() || D.getAttr<UninitializedAttr>() ||
+        hasNoTrivialAutoVarInitAttr(type->getAsTagDecl()) ||
+        hasNoTrivialAutoVarInitAttr(CurFuncDecl))
            ? LangOptions::TrivialAutoVarInitKind::Uninitialized
-           : (D.getAttr<UninitializedAttr>()
-                  ? LangOptions::TrivialAutoVarInitKind::Uninitialized
-                  : getContext().getLangOpts().getTrivialAutoVarInit()));
+           : getContext().getLangOpts().getTrivialAutoVarInit());
 
   auto initializeWhatIsTechnicallyUninitialized = [&](Address Loc) {
     if (trivialAutoVarInit ==
@@ -1944,13 +1947,13 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
                                   replaceUndef(CGM, isPattern, constant));
     }
 
-    if (constant && D.getType()->isBitIntType() &&
-        CGM.getTypes().typeRequiresSplitIntoByteArray(D.getType())) {
+    if (constant && type->isBitIntType() &&
+        CGM.getTypes().typeRequiresSplitIntoByteArray(type)) {
       // Constants for long _BitInt types are split into individual bytes.
       // Try to fold these back into an integer constant so it can be stored
       // properly.
-      llvm::Type *LoadType = CGM.getTypes().convertTypeForLoadStore(
-          D.getType(), constant->getType());
+      llvm::Type *LoadType =
+          CGM.getTypes().convertTypeForLoadStore(type, constant->getType());
       constant = llvm::ConstantFoldLoadFromConst(
           constant, LoadType, llvm::APInt::getZero(32), CGM.getDataLayout());
     }
@@ -1967,8 +1970,7 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
       // It may be that the Init expression uses other uninitialized memory,
       // but auto-var-init here would not help, as auto-init would get
       // overwritten by Init.
-      if (!D.getType()->isScalarType() || capturedByInit ||
-          isAccessedBy(D, Init)) {
+      if (!type->isScalarType() || capturedByInit || isAccessedBy(D, Init)) {
         initializeWhatIsTechnicallyUninitialized(Loc);
       }
     }
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 16fdcf6b82853..ac31dff11b585 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -471,7 +471,7 @@ ComplexPairTy ComplexExprEmitter::VisitExpr(Expr *E) {
   CGF.ErrorUnsupported(E, "complex expression");
   llvm::Type *EltTy =
     CGF.ConvertType(getComplexType(E->getType())->getElementType());
-  llvm::Value *U = llvm::UndefValue::get(EltTy);
+  llvm::Value *U = llvm::PoisonValue::get(EltTy);
   return ComplexPairTy(U, U);
 }
 
@@ -1449,7 +1449,7 @@ ComplexPairTy ComplexExprEmitter::VisitVAArgExpr(VAArgExpr *E) {
     CGF.ErrorUnsupported(E, "complex va_arg expression");
     llvm::Type *EltTy =
       CGF.ConvertType(E->getType()->castAs<ComplexType>()->getElementType());
-    llvm::Value *U = llvm::UndefValue::get(EltTy);
+    llvm::Value *U = llvm::PoisonValue::get(EltTy);
     return ComplexPairTy(U, U);
   }
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 287d911e10ba5..4ae8a2b22b1bb 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1828,7 +1828,7 @@ Value *ScalarExprEmitter::VisitExpr(Expr *E) {
   CGF.ErrorUnsupported(E, "scalar expression");
   if (E->getType()->isVoidType())
     return nullptr;
-  return llvm::UndefValue::get(CGF.ConvertType(E->getType()));
+  return llvm::PoisonValue::get(CGF.ConvertType(E->getType()));
 }
 
 Value *
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 73e3f9e256f0d..756f0482b8ea7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2304,6 +2304,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::GFX940:
       case OffloadArch::GFX941:
       case OffloadArch::GFX942:
+      case OffloadArch::GFX950:
       case OffloadArch::GFX10_1_GENERIC:
       case OffloadArch::GFX1010:
       case OffloadArch::GFX1011:
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 6a2f82f9e1390..ef6bb4f049d6e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -486,7 +486,7 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   if (IndirectBranch) {
     llvm::PHINode *PN = cast<llvm::PHINode>(IndirectBranch->getAddress());
     if (PN->getNumIncomingValues() == 0) {
-      PN->replaceAllUsesWith(llvm::UndefValue::get(PN->getType()));
+      PN->replaceAllUsesWith(llvm::PoisonValue::get(PN->getType()));
       PN->eraseFromParent();
     }
   }
@@ -635,7 +635,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
 
   CGM.GenKernelArgMetadata(Fn, FD, this);
 
-  if (!getLangOpts().OpenCL)
+  if (!(getLangOpts().OpenCL ||
+        (getLangOpts().CUDA &&
+         getContext().getTargetInfo().getTriple().isSPIRV())))
     return;
 
   if (const VecTypeHintAttr *A = FD->getAttr<VecTypeHintAttr>()) {
@@ -1022,6 +1024,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   }
 
   if (FD && (getLangOpts().OpenCL ||
+             (getLangOpts().CUDA &&
+              getContext().getTargetInfo().getTriple().isSPIRV()) ||
              ((getLangOpts().HIP || getLangOpts().OffloadViaLLVM) &&
               getLangOpts().CUDAIsDevice))) {
     // Add metadata for a kernel function.
@@ -1106,8 +1110,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   // Create a marker to make it easy to insert allocas into the entryblock
   // later.  Don't create this with the builder, because we don't want it
   // folded.
-  llvm::Value *Undef = llvm::UndefValue::get(Int32Ty);
-  AllocaInsertPt = new llvm::BitCastInst(Undef, Int32Ty, "allocapt", EntryBB);
+  llvm::Value *Poison = llvm::PoisonValue::get(Int32Ty);
+  AllocaInsertPt = new llvm::BitCastInst(Poison, Int32Ty, "allocapt", EntryBB);
 
   ReturnBlock = getJumpDestInCurrentScope("return");
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 508f53482d4e1..b854eeb62a80c 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -733,7 +733,7 @@ void CodeGenModule::checkAliases() {
   for (const GlobalDecl &GD : Aliases) {
     StringRef MangledName = getMangledName(GD);
     llvm::GlobalValue *Alias = GetGlobalValue(MangledName);
-    Alias->replaceAllUsesWith(llvm::UndefValue::get(Alias->getType()));
+    Alias->replaceAllUsesWith(llvm::PoisonValue::get(Alias->getType()));
     Alias->eraseFromParent();
   }
 }
@@ -1213,6 +1213,9 @@ void CodeGenModule::Release() {
       getModule().addModuleFlag(llvm::Module::Min,
                                 "sign-return-address-with-bkey", 1);
 
+    if (LangOpts.PointerAuthELFGOT)
+      getModule().addModuleFlag(llvm::Module::Min, "ptrauth-elf-got", 1);
+
     if (getTriple().isOSLinux()) {
       assert(getTriple().isOSBinFormatELF());
       using namespace llvm::ELF;
@@ -5569,7 +5572,7 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
         }
       } else {
         ErrorUnsupported(D, "static initializer");
-        Init = llvm::UndefValue::get(getTypes().ConvertType(T));
+        Init = llvm::PoisonValue::get(getTypes().ConvertType(T));
       }
     } else {
       Init = Initializer;
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index b91233ee2c50a..1abfe8fd92807 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -65,9 +65,9 @@ static const SanitizerMask AlwaysRecoverable = SanitizerKind::KernelAddress |
 static const SanitizerMask NeedsLTO = SanitizerKind::CFI;
 static const SanitizerMask TrappingSupported =
     (SanitizerKind::Undefined & ~SanitizerKind::Vptr) | SanitizerKind::Integer |
-    SanitizerKind::Nullability | SanitizerKind::LocalBounds |
-    SanitizerKind::CFI | SanitizerKind::FloatDivideByZero |
-    SanitizerKind::ObjCCast;
+    SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
+    SanitizerKind::LocalBounds | SanitizerKind::CFI |
+    SanitizerKind::FloatDivideByZero | SanitizerKind::ObjCCast;
 static const SanitizerMask TrappingDefault = SanitizerKind::CFI;
 static const SanitizerMask CFIClasses =
     SanitizerKind::CFIVCall | SanitizerKind::CFINVCall |
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index cbba4289eb945..8d977149e6248 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1763,18 +1763,13 @@ Arg *tools::getLastProfileUseArg(const ArgList &Args) {
 
 Arg *tools::getLastProfileSampleUseArg(const ArgList &Args) {
   auto *ProfileSampleUseArg = Args.getLastArg(
-      options::OPT_fprofile_sample_use, options::OPT_fprofile_sample_use_EQ,
-      options::OPT_fauto_profile, options::OPT_fauto_profile_EQ,
-      options::OPT_fno_profile_sample_use, options::OPT_fno_auto_profile);
-
-  if (ProfileSampleUseArg &&
-      (ProfileSampleUseArg->getOption().matches(
-           options::OPT_fno_profile_sample_use) ||
-       ProfileSampleUseArg->getOption().matches(options::OPT_fno_auto_profile)))
+      options::OPT_fprofile_sample_use_EQ, options::OPT_fno_profile_sample_use);
+
+  if (ProfileSampleUseArg && (ProfileSampleUseArg->getOption().matches(
+                                 options::OPT_fno_profile_sample_use)))
     return nullptr;
 
-  return Args.getLastArg(options::OPT_fprofile_sample_use_EQ,
-                         options::OPT_fauto_profile_EQ);
+  return Args.getLastArg(options::OPT_fprofile_sample_use_EQ);
 }
 
 const char *tools::RelocationModelName(llvm::Reloc::Model Model) {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 269cbef272079..bc5239209f3aa 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1554,7 +1554,7 @@ class AnnotatingParser {
         };
 
         if (IsInstancePort())
-          Tok->setFinalizedType(TT_VerilogInstancePortLParen);
+          Tok->setType(TT_VerilogInstancePortLParen);
       }
 
       if (!parseParens())
@@ -1730,7 +1730,7 @@ class AnnotatingParser {
         Tok->setType(TT_InheritanceComma);
         break;
       case Context::VerilogInstancePortList:
-        Tok->setFinalizedType(TT_VerilogInstancePortComma);
+        Tok->setType(TT_VerilogInstancePortComma);
         break;
       default:
         if (Style.isVerilog() && Contexts.size() == 1 &&
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 5f1dd38ef1eb3..c182aaf0876d1 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -4441,7 +4441,8 @@ unsigned UnwrappedLineParser::parseVerilogHierarchyHeader() {
           Prev->setFinalizedType(TT_VerilogDimensionedTypeName);
         parseSquare();
       } else if (Keywords.isVerilogIdentifier(*FormatTok) ||
-                 FormatTok->isOneOf(Keywords.kw_automatic, tok::kw_static)) {
+                 FormatTok->isOneOf(tok::hash, tok::hashhash, tok::coloncolon,
+                                    Keywords.kw_automatic, tok::kw_static)) {
         nextToken();
       } else {
         break;
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index e3145dcacf58d..3dd94c31b2bc7 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3452,6 +3452,8 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fptrauth_init_fini);
   if (Opts.PointerAuthInitFiniAddressDiscrimination)
     GenerateArg(Consumer, OPT_fptrauth_init_fini_address_discrimination);
+  if (Opts.PointerAuthELFGOT)
+    GenerateArg(Consumer, OPT_fptrauth_elf_got);
 }
 
 static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
@@ -3472,6 +3474,7 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthInitFini = Args.hasArg(OPT_fptrauth_init_fini);
   Opts.PointerAuthInitFiniAddressDiscrimination =
       Args.hasArg(OPT_fptrauth_init_fini_address_discrimination);
+  Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got);
 }
 
 /// Check if input file kind and language standard are compatible.
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
index dafc2a2f3e6a7..c065ea92a2dd5 100644
--- a/clang/lib/Headers/lasxintrin.h
+++ b/clang/lib/Headers/lasxintrin.h
@@ -2585,7 +2585,7 @@ extern __inline
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
     __lasx_xvorn_v(__m256i _1, __m256i _2) {
-  return (__m256i)__builtin_lasx_xvorn_v((v32i8)_1, (v32i8)_2);
+  return (__m256i)__builtin_lasx_xvorn_v((v32u8)_1, (v32u8)_2);
 }
 
 #define __lasx_xvldi(/*i13*/ _1) ((__m256i)__builtin_lasx_xvldi((_1)))
diff --git a/clang/lib/Headers/lsxintrin.h b/clang/lib/Headers/lsxintrin.h
index f347955ce6fb5..f020b0c18f0d2 100644
--- a/clang/lib/Headers/lsxintrin.h
+++ b/clang/lib/Headers/lsxintrin.h
@@ -3425,7 +3425,7 @@ extern __inline
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
     __lsx_vorn_v(__m128i _1, __m128i _2) {
-  return (__m128i)__builtin_lsx_vorn_v((v16i8)_1, (v16i8)_2);
+  return (__m128i)__builtin_lsx_vorn_v((v16u8)_1, (v16u8)_2);
 }
 
 #define __lsx_vldi(/*i13*/ _1) ((__m128i)__builtin_lsx_vldi((_1)))
diff --git a/clang/lib/Interpreter/CMakeLists.txt b/clang/lib/Interpreter/CMakeLists.txt
index d5ffe78251d25..df7ea82e0dada 100644
--- a/clang/lib/Interpreter/CMakeLists.txt
+++ b/clang/lib/Interpreter/CMakeLists.txt
@@ -10,7 +10,8 @@ set(LLVM_LINK_COMPONENTS
    Support
    Target
    TargetParser
-  )
+   TransformUtils
+   )
 
 if (EMSCRIPTEN AND "lld" IN_LIST LLVM_ENABLE_PROJECTS)
   set(WASM_SRC Wasm.cpp)
diff --git a/clang/lib/Interpreter/IncrementalExecutor.h b/clang/lib/Interpreter/IncrementalExecutor.h
index 7954cde36588b..dbd61f0b8b1eb 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.h
+++ b/clang/lib/Interpreter/IncrementalExecutor.h
@@ -56,7 +56,7 @@ class IncrementalExecutor {
   virtual llvm::Error addModule(PartialTranslationUnit &PTU);
   virtual llvm::Error removeModule(PartialTranslationUnit &PTU);
   virtual llvm::Error runCtors() const;
-  llvm::Error cleanUp();
+  virtual llvm::Error cleanUp();
   llvm::Expected<llvm::orc::ExecutorAddr>
   getSymbolAddress(llvm::StringRef Name, SymbolNameKind NameKind) const;
 
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index bc96da811d44c..94f0156ec151f 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -50,6 +50,9 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/Transforms/Utils/Cloning.h" // for CloneModule
+
+#define DEBUG_TYPE "clang-repl"
 
 using namespace clang;
 // FIXME: Figure out how to unify with namespace init_convenience from
@@ -196,7 +199,6 @@ IncrementalCompilerBuilder::CreateCpp() {
 #ifdef __EMSCRIPTEN__
   Argv.push_back("-target");
   Argv.push_back("wasm32-unknown-emscripten");
-  Argv.push_back("-pie");
   Argv.push_back("-shared");
 #endif
   Argv.insert(Argv.end(), UserArgs.begin(), UserArgs.end());
@@ -339,19 +341,8 @@ class IncrementalAction : public WrapperFrontendAction {
   }
 
   void ExecuteAction() override {
-    CompilerInstance &CI = getCompilerInstance();
-    assert(CI.hasPreprocessor() && "No PP!");
-
-    // Use a code completion consumer?
-    CodeCompleteConsumer *CompletionConsumer = nullptr;
-    if (CI.hasCodeCompletionConsumer())
-      CompletionConsumer = &CI.getCodeCompletionConsumer();
-
-    Preprocessor &PP = CI.getPreprocessor();
-    PP.EnterMainSourceFile();
-
-    if (!CI.hasSema())
-      CI.createSema(getTranslationUnitKind(), CompletionConsumer);
+    WrapperFrontendAction::ExecuteAction();
+    getCompilerInstance().getSema().CurContext = nullptr;
   }
 
   // Do not terminate after processing the input. This allows us to keep various
@@ -385,8 +376,6 @@ Interpreter::Interpreter(std::unique_ptr<CompilerInstance> Instance,
     return;
   CI->ExecuteAction(*Act);
 
-  ASTContext &C = CI->getASTContext();
-
   IncrParser = std::make_unique<IncrementalParser>(*CI, ErrOut);
 
   if (ErrOut)
@@ -394,18 +383,22 @@ Interpreter::Interpreter(std::unique_ptr<CompilerInstance> Instance,
 
   if (getCodeGen()) {
     CachedInCodeGenModule = GenModule();
+    // The initial PTU is filled by `-include` or by CUDA includes
+    // automatically.
+    if (!CI->getPreprocessorOpts().Includes.empty()) {
+      // We can't really directly pass the CachedInCodeGenModule to the Jit
+      // because it will steal it, causing dangling references as explained in
+      // Interpreter::Execute
+      auto M = llvm::CloneModule(*CachedInCodeGenModule);
+      ASTContext &C = CI->getASTContext();
+      RegisterPTU(C.getTranslationUnitDecl(), std::move(M));
+    }
     if (llvm::Error Err = CreateExecutor()) {
       ErrOut = joinErrors(std::move(ErrOut), std::move(Err));
       return;
     }
   }
 
-  // The initial PTU is filled by `-include` or by CUDA includes automatically.
-  RegisterPTU(C.getTranslationUnitDecl());
-
-  // Prepare the IncrParser for input.
-  llvm::cantFail(Parse(""));
-
   // Not all frontends support code-generation, e.g. ast-dump actions don't
   if (getCodeGen()) {
     // Process the PTUs that came from initialization. For example -include will
@@ -535,14 +528,25 @@ size_t Interpreter::getEffectivePTUSize() const {
   return PTUs.size() - InitPTUSize;
 }
 
-PartialTranslationUnit &Interpreter::RegisterPTU(TranslationUnitDecl *TU) {
+PartialTranslationUnit &
+Interpreter::RegisterPTU(TranslationUnitDecl *TU,
+                         std::unique_ptr<llvm::Module> M /*={}*/) {
   PTUs.emplace_back(PartialTranslationUnit());
   PartialTranslationUnit &LastPTU = PTUs.back();
   LastPTU.TUPart = TU;
 
-  if (std::unique_ptr<llvm::Module> M = GenModule())
-    LastPTU.TheModule = std::move(M);
+  if (!M)
+    M = GenModule();
+
+  assert((!getCodeGen() || M) && "Must have a llvm::Module at this point");
 
+  LastPTU.TheModule = std::move(M);
+  LLVM_DEBUG(llvm::dbgs() << "compile-ptu " << PTUs.size() - 1
+                          << ": [TU=" << LastPTU.TUPart);
+  if (LastPTU.TheModule)
+    LLVM_DEBUG(llvm::dbgs() << ", M=" << LastPTU.TheModule.get() << " ("
+                            << LastPTU.TheModule->getName() << ")");
+  LLVM_DEBUG(llvm::dbgs() << "]\n");
   return LastPTU;
 }
 
@@ -615,6 +619,14 @@ void Interpreter::ResetExecutor() { IncrExecutor.reset(); }
 
 llvm::Error Interpreter::Execute(PartialTranslationUnit &T) {
   assert(T.TheModule);
+  LLVM_DEBUG(llvm::dbgs()
+             << "execute-ptu "
+             << ((std::find(PTUs.begin(), PTUs.end(), T) != PTUs.end())
+                     ? std::distance(PTUs.begin(),
+                                     std::find(PTUs.begin(), PTUs.end(), T))
+                     : -1)
+             << ": [TU=" << T.TUPart << ", M=" << T.TheModule.get() << " ("
+             << T.TheModule->getName() << ")]\n");
   if (!IncrExecutor) {
     auto Err = CreateExecutor();
     if (Err)
@@ -723,10 +735,12 @@ std::unique_ptr<llvm::Module> Interpreter::GenModule() {
     // of the module which does not map well to CodeGen's design. To work this
     // around we created an empty module to make CodeGen happy. We should make
     // sure it always stays empty.
-    assert((!CachedInCodeGenModule || (CachedInCodeGenModule->empty() &&
-                                       CachedInCodeGenModule->global_empty() &&
-                                       CachedInCodeGenModule->alias_empty() &&
-                                       CachedInCodeGenModule->ifunc_empty())) &&
+    assert(((!CachedInCodeGenModule ||
+             !getCompilerInstance()->getPreprocessorOpts().Includes.empty()) ||
+            (CachedInCodeGenModule->empty() &&
+             CachedInCodeGenModule->global_empty() &&
+             CachedInCodeGenModule->alias_empty() &&
+             CachedInCodeGenModule->ifunc_empty())) &&
            "CodeGen wrote to a readonly module");
     std::unique_ptr<llvm::Module> M(CG->ReleaseModule());
     CG->StartModule("incr_module_" + std::to_string(ID++), M->getContext());
diff --git a/clang/lib/Interpreter/Wasm.cpp b/clang/lib/Interpreter/Wasm.cpp
index 1001410aa0f27..79efbaa03982d 100644
--- a/clang/lib/Interpreter/Wasm.cpp
+++ b/clang/lib/Interpreter/Wasm.cpp
@@ -72,13 +72,13 @@ llvm::Error WasmIncrementalExecutor::addModule(PartialTranslationUnit &PTU) {
   OutputFile.close();
 
   std::vector<const char *> LinkerArgs = {"wasm-ld",
-                                          "-pie",
+                                          "-shared",
                                           "--import-memory",
                                           "--no-entry",
                                           "--export-all",
                                           "--experimental-pic",
-                                          "--no-export-dynamic",
                                           "--stack-first",
+                                          "--allow-undefined",
                                           OutputFileName.c_str(),
                                           "-o",
                                           OutputFileName.c_str()};
@@ -109,6 +109,12 @@ llvm::Error WasmIncrementalExecutor::runCtors() const {
   return llvm::Error::success();
 }
 
+llvm::Error WasmIncrementalExecutor::cleanUp() {
+  // Deliberate no-op: IncrementalExecutor::cleanUp() would try to
+  // deinitialize the JIT, which hasn't been initialized for this executor.
+  return llvm::Error::success();
+}
+
 WasmIncrementalExecutor::~WasmIncrementalExecutor() = default;
 
 } // namespace clang
diff --git a/clang/lib/Interpreter/Wasm.h b/clang/lib/Interpreter/Wasm.h
index b1fd88024f14d..4632613326d39 100644
--- a/clang/lib/Interpreter/Wasm.h
+++ b/clang/lib/Interpreter/Wasm.h
@@ -28,6 +28,7 @@ class WasmIncrementalExecutor : public IncrementalExecutor {
   llvm::Error addModule(PartialTranslationUnit &PTU) override;
   llvm::Error removeModule(PartialTranslationUnit &PTU) override;
   llvm::Error runCtors() const override;
+  llvm::Error cleanUp() override;
 
   ~WasmIncrementalExecutor() override;
 };
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 239b70698e487..bcbf4dfbabafa 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -1454,7 +1454,7 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc,
 
   SmallVector<const IdentifierInfo *, 12> KeyIdents;
   SmallVector<SourceLocation, 12> KeyLocs;
-  SmallVector<ParmVarDecl *, 12> ObjCParamInfo;
+  SmallVector<SemaObjC::ObjCArgInfo, 12> ArgInfos;
   ParseScope PrototypeScope(this, Scope::FunctionPrototypeScope |
                             Scope::FunctionDeclarationScope | Scope::DeclScope);
 
@@ -1495,9 +1495,7 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc,
     ArgInfo.NameLoc = Tok.getLocation();
     ConsumeToken(); // Eat the identifier.
 
-    ParmVarDecl *Param = Actions.ObjC().ActOnMethodParmDeclaration(
-        getCurScope(), ArgInfo, ObjCParamInfo.size(), MethodDefinition);
-    ObjCParamInfo.push_back(Param);
+    ArgInfos.push_back(ArgInfo);
     KeyIdents.push_back(SelIdent);
     KeyLocs.push_back(selLoc);
 
@@ -1557,6 +1555,17 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc,
                                                     nullptr));
   }
 
+  // Turn ArgInfos into parameters. This must happen after parsing all
+  // parameters for bug compatibility with previous versions of Clang. (For
+  // instance, if a method declares a parameter called "id", that parameter must
+  // not shadow the "id" type.)
+  SmallVector<ParmVarDecl *, 12> ObjCParamInfo;
+  for (auto &ArgInfo : ArgInfos) {
+    ParmVarDecl *Param = Actions.ObjC().ActOnMethodParmDeclaration(
+        getCurScope(), ArgInfo, ObjCParamInfo.size(), MethodDefinition);
+    ObjCParamInfo.push_back(Param);
+  }
+
   // FIXME: Add support for optional parameter list...
   // If attributes exist after the method, parse them.
   MaybeParseAttributes(PAKM_CXX11 | (getLangOpts().ObjC ? PAKM_GNU : 0),
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index f11fd3a7e4038..075c0df3f5496 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -2262,19 +2262,28 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler {
         MsgParam = 5;
       } else if (const auto *ECE = dyn_cast<ExplicitCastExpr>(Operation)) {
         QualType destType = ECE->getType();
+        bool destTypeComplete = true;
+
         if (!isa<PointerType>(destType))
           return;
+        destType = destType.getTypePtr()->getPointeeType();
+        if (const auto *D = destType->getAsTagDecl())
+          destTypeComplete = D->isCompleteDefinition();
 
-        const uint64_t dSize =
-            Ctx.getTypeSize(destType.getTypePtr()->getPointeeType());
+        // If destination type is incomplete, it is unsafe to cast to anyway, no
+        // need to compare its size:
+        if (destTypeComplete) {
+          const uint64_t dSize = Ctx.getTypeSize(destType);
+          QualType srcType = ECE->getSubExpr()->getType();
 
-        QualType srcType = ECE->getSubExpr()->getType();
-        const uint64_t sSize =
-            Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType());
+          assert(srcType->isPointerType());
 
-        if (sSize >= dSize)
-          return;
+          const uint64_t sSize =
+              Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType());
 
+          if (sSize >= dSize)
+            return;
+        }
         if (const auto *CE = dyn_cast<CXXMemberCallExpr>(
                 ECE->getSubExpr()->IgnoreParens())) {
           D = CE->getMethodDecl();
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index a1a402b4a2b53..8886e5e307ddf 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -45,10 +45,14 @@ enum LifetimeKind {
   /// a default member initializer), the program is ill-formed.
   LK_MemInitializer,
 
-  /// The lifetime of a temporary bound to this entity probably ends too soon,
+  /// The lifetime of a temporary bound to this entity may end too soon,
   /// because the entity is a pointer and we assign the address of a temporary
   /// object to it.
   LK_Assignment,
+
+  /// The lifetime of a temporary bound to this entity may end too soon,
+  /// because the entity may capture the reference to a temporary object.
+  LK_LifetimeCapture,
 };
 using LifetimeResult =
     llvm::PointerIntPair<const InitializedEntity *, 3, LifetimeKind>;
@@ -1095,9 +1099,7 @@ static bool pathOnlyHandlesGslPointer(const IndirectLocalPath &Path) {
 }
 
 static bool isAssignmentOperatorLifetimeBound(CXXMethodDecl *CMD) {
-  if (!CMD)
-    return false;
-  return isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 &&
+  return CMD && isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 &&
          CMD->getParamDecl(0)->hasAttr<LifetimeBoundAttr>();
 }
 
@@ -1110,13 +1112,14 @@ static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef,
            isAssignmentOperatorLifetimeBound(Entity.AssignmentOperator)));
 }
 
-static void checkExprLifetimeImpl(Sema &SemaRef,
-                                  const InitializedEntity *InitEntity,
-                                  const InitializedEntity *ExtendingEntity,
-                                  LifetimeKind LK,
-                                  const AssignedEntity *AEntity, Expr *Init) {
-  assert((AEntity && LK == LK_Assignment) ||
-         (InitEntity && LK != LK_Assignment));
+static void
+checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity,
+                      const InitializedEntity *ExtendingEntity, LifetimeKind LK,
+                      const AssignedEntity *AEntity,
+                      const CapturingEntity *CapEntity, Expr *Init) {
+  assert(!AEntity || LK == LK_Assignment);
+  assert(!CapEntity || LK == LK_LifetimeCapture);
+  assert(!InitEntity || (LK != LK_Assignment && LK != LK_LifetimeCapture));
   // If this entity doesn't have an interesting lifetime, don't bother looking
   // for temporaries within its initializer.
   if (LK == LK_FullExpression)
@@ -1199,12 +1202,23 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
       break;
     }
 
+    case LK_LifetimeCapture: {
+      // The captured entity has lifetime beyond the full-expression,
+      // and the capturing entity does too, so don't warn.
+      if (!MTE)
+        return false;
+      if (CapEntity->Entity)
+        SemaRef.Diag(DiagLoc, diag::warn_dangling_reference_captured)
+            << CapEntity->Entity << DiagRange;
+      else
+        SemaRef.Diag(DiagLoc, diag::warn_dangling_reference_captured_by_unknown)
+            << DiagRange;
+      return false;
+    }
+
     case LK_Assignment: {
       if (!MTE || pathContainsInit(Path))
         return false;
-      assert(shouldLifetimeExtendThroughPath(Path) ==
-                 PathLifetimeKind::NoExtend &&
-             "No lifetime extension for assignments");
       if (IsGslPtrValueFromGslTempOwner)
         SemaRef.Diag(DiagLoc, diag::warn_dangling_lifetime_pointer_assignment)
             << AEntity->LHS << DiagRange;
@@ -1413,13 +1427,23 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
   };
 
   llvm::SmallVector<IndirectLocalPathEntry, 8> Path;
-  if (LK == LK_Assignment &&
-      shouldRunGSLAssignmentAnalysis(SemaRef, *AEntity)) {
-    Path.push_back(
-        {isAssignmentOperatorLifetimeBound(AEntity->AssignmentOperator)
-             ? IndirectLocalPathEntry::LifetimeBoundCall
-             : IndirectLocalPathEntry::GslPointerAssignment,
-         Init});
+  switch (LK) {
+  case LK_Assignment: {
+    if (shouldRunGSLAssignmentAnalysis(SemaRef, *AEntity))
+      Path.push_back(
+          {isAssignmentOperatorLifetimeBound(AEntity->AssignmentOperator)
+               ? IndirectLocalPathEntry::LifetimeBoundCall
+               : IndirectLocalPathEntry::GslPointerAssignment,
+           Init});
+    break;
+  }
+  case LK_LifetimeCapture: {
+    if (isPointerLikeType(Init->getType()))
+      Path.push_back({IndirectLocalPathEntry::GslPointerInit, Init});
+    break;
+  }
+  default:
+    break;
   }
 
   if (Init->isGLValue())
@@ -1432,23 +1456,23 @@ static void checkExprLifetimeImpl(Sema &SemaRef,
         /*RevisitSubinits=*/!InitEntity);
 }
 
-void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity,
+void checkInitLifetime(Sema &SemaRef, const InitializedEntity &Entity,
                        Expr *Init) {
   auto LTResult = getEntityLifetime(&Entity);
   LifetimeKind LK = LTResult.getInt();
   const InitializedEntity *ExtendingEntity = LTResult.getPointer();
   checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK,
-                        /*AEntity*/ nullptr, Init);
+                        /*AEntity=*/nullptr, /*CapEntity=*/nullptr, Init);
 }
 
 void checkExprLifetimeMustTailArg(Sema &SemaRef,
                                   const InitializedEntity &Entity, Expr *Init) {
   checkExprLifetimeImpl(SemaRef, &Entity, nullptr, LK_MustTail,
-                        /*AEntity*/ nullptr, Init);
+                        /*AEntity=*/nullptr, /*CapEntity=*/nullptr, Init);
 }
 
-void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity,
-                       Expr *Init) {
+void checkAssignmentLifetime(Sema &SemaRef, const AssignedEntity &Entity,
+                             Expr *Init) {
   bool EnableDanglingPointerAssignment = !SemaRef.getDiagnostics().isIgnored(
       diag::warn_dangling_pointer_assignment, SourceLocation());
   bool RunAnalysis = (EnableDanglingPointerAssignment &&
@@ -1460,7 +1484,20 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity,
 
   checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr,
                         /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity,
-                        Init);
+                        /*CapEntity=*/nullptr, Init);
+}
+
+void checkCaptureByLifetime(Sema &SemaRef, const CapturingEntity &Entity,
+                            Expr *Init) {
+  if (SemaRef.getDiagnostics().isIgnored(diag::warn_dangling_reference_captured,
+                                         SourceLocation()) &&
+      SemaRef.getDiagnostics().isIgnored(
+          diag::warn_dangling_reference_captured_by_unknown, SourceLocation()))
+    return;
+  return checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr,
+                               /*ExtendingEntity=*/nullptr, LK_LifetimeCapture,
+                               /*AEntity=*/nullptr,
+                               /*CapEntity=*/&Entity, Init);
 }
 
 } // namespace clang::sema
diff --git a/clang/lib/Sema/CheckExprLifetime.h b/clang/lib/Sema/CheckExprLifetime.h
index 903f312f3533e..38b7061988dc7 100644
--- a/clang/lib/Sema/CheckExprLifetime.h
+++ b/clang/lib/Sema/CheckExprLifetime.h
@@ -25,15 +25,31 @@ struct AssignedEntity {
   CXXMethodDecl *AssignmentOperator = nullptr;
 };
 
+struct CapturingEntity {
+  // In a function call involving a lifetime capture, this would be the
+  // argument capturing the lifetime of another argument.
+  //    void addToSet(std::string_view sv [[clang::lifetime_capture_by(setsv)]],
+  //                  set<std::string_view>& setsv);
+  //    set<std::string_view> setsv;
+  //    addToSet(std::string(), setsv); // Here 'setsv' is the 'Entity'.
+  //
+  // This is 'nullptr' when the capturing entity is 'global' or 'unknown'.
+  Expr *Entity = nullptr;
+};
+
 /// Check that the lifetime of the given expr (and its subobjects) is
 /// sufficient for initializing the entity, and perform lifetime extension
 /// (when permitted) if not.
-void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity,
+void checkInitLifetime(Sema &SemaRef, const InitializedEntity &Entity,
                        Expr *Init);
 
 /// Check that the lifetime of the given expr (and its subobjects) is
 /// sufficient for assigning to the entity.
-void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, Expr *Init);
+void checkAssignmentLifetime(Sema &SemaRef, const AssignedEntity &Entity,
+                             Expr *Init);
+
+void checkCaptureByLifetime(Sema &SemaRef, const CapturingEntity &Entity,
+                            Expr *Init);
 
 /// Check that the lifetime of the given expr (and its subobjects) is
 /// sufficient, assuming that it is passed as an argument to a musttail
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index cac15b974a276..a14e7d50a6043 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -139,7 +139,7 @@ struct BuiltinTypeDeclBuilder {
     Attr *ResourceAttr = HLSLResourceAttr::CreateImplicit(Ctx, RK);
     if (CreateHLSLAttributedResourceType(S, Ctx.HLSLResourceTy, Attrs,
                                          AttributedResTy))
-      addMemberVariable("h", AttributedResTy, {ResourceAttr}, Access);
+      addMemberVariable("__handle", AttributedResTy, {ResourceAttr}, Access);
     return *this;
   }
 
@@ -212,11 +212,11 @@ struct BuiltinTypeDeclBuilder {
 
     // Subscript operators return references to elements, const makes the
     // reference and method const so that the underlying data is not mutable.
-    ReturnTy = AST.getLValueReferenceType(ReturnTy);
     if (IsConst) {
       ExtInfo.TypeQuals.addConst();
       ReturnTy.addConst();
     }
+    ReturnTy = AST.getLValueReferenceType(ReturnTy);
 
     QualType MethodTy =
         AST.getFunctionType(ReturnTy, {AST.UnsignedIntTy}, ExtInfo);
@@ -480,8 +480,8 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
 
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
     setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
-                    ResourceKind::TypedBuffer,
-                    /*IsROV=*/false, /*RawBuffer=*/false)
+                    ResourceKind::TypedBuffer, /*IsROV=*/false,
+                    /*RawBuffer=*/false)
         .addArraySubscriptOperators()
         .completeDefinition();
   });
@@ -503,8 +503,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
              .Record;
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
     setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, ResourceKind::RawBuffer,
-                    /*IsROV=*/false,
-                    /*RawBuffer=*/true)
+                    /*IsROV=*/false, /*RawBuffer=*/true)
         .addArraySubscriptOperators()
         .completeDefinition();
   });
@@ -514,8 +513,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
              .Record;
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
     setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer,
-                    /*IsROV=*/false,
-                    /*RawBuffer=*/true)
+                    /*IsROV=*/false, /*RawBuffer=*/true)
         .addArraySubscriptOperators()
         .completeDefinition();
   });
@@ -526,8 +524,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
           .Record;
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
     setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer,
-                    /*IsROV=*/false,
-                    /*RawBuffer=*/true)
+                    /*IsROV=*/false, /*RawBuffer=*/true)
         .completeDefinition();
   });
 
@@ -537,8 +534,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
           .Record;
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
     setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer,
-                    /*IsROV=*/false,
-                    /*RawBuffer=*/true)
+                    /*IsROV=*/false, /*RawBuffer=*/true)
         .completeDefinition();
   });
 
@@ -547,9 +543,8 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
              .addSimpleTemplateParams(*SemaPtr, {"element_type"})
              .Record;
   onCompletion(Decl, [this](CXXRecordDecl *Decl) {
-    setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
-                    ResourceKind::TypedBuffer, /*IsROV=*/true,
-                    /*RawBuffer=*/true)
+    setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer,
+                    /*IsROV=*/true, /*RawBuffer=*/true)
         .addArraySubscriptOperators()
         .completeDefinition();
   });
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index 8c830769a969c..d465599450e7f 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -179,9 +179,9 @@ static ScopePair GetDiagForGotoScopeDecl(Sema &S, const Decl *D) {
       }
     }
 
-    const Expr *Init = VD->getInit();
-    if (S.Context.getLangOpts().CPlusPlus && VD->hasLocalStorage() && Init &&
-        !Init->containsErrors()) {
+    if (const Expr *Init = VD->getInit(); S.Context.getLangOpts().CPlusPlus &&
+                                          VD->hasLocalStorage() && Init &&
+                                          !Init->containsErrors()) {
       // C++11 [stmt.dcl]p3:
       //   A program that jumps from a point where a variable with automatic
       //   storage duration is not in scope to a point where it is in scope
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2d4a7cd287b70..2fd990750ed21 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CheckExprLifetime.h"
 #include "clang/AST/APValue.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
@@ -3222,6 +3223,47 @@ void Sema::CheckArgAlignment(SourceLocation Loc, NamedDecl *FDecl,
         << ParamName << (FDecl != nullptr) << FDecl;
 }
 
+void Sema::checkLifetimeCaptureBy(FunctionDecl *FD, bool IsMemberFunction,
+                                  const Expr *ThisArg,
+                                  ArrayRef<const Expr *> Args) {
+  if (!FD || Args.empty())
+    return;
+  auto GetArgAt = [&](int Idx) -> const Expr * {
+    if (Idx == LifetimeCaptureByAttr::GLOBAL ||
+        Idx == LifetimeCaptureByAttr::UNKNOWN)
+      return nullptr;
+    if (IsMemberFunction && Idx == 0)
+      return ThisArg;
+    return Args[Idx - IsMemberFunction];
+  };
+  auto HandleCaptureByAttr = [&](const LifetimeCaptureByAttr *Attr,
+                                 unsigned ArgIdx) {
+    Expr *Captured = Attr ? const_cast<Expr *>(GetArgAt(ArgIdx)) : nullptr;
+    if (!Captured) // No attribute, or no operand to check (null ThisArg).
+      return;
+    for (int CapturingParamIdx : Attr->params()) {
+      Expr *Capturing = const_cast<Expr *>(GetArgAt(CapturingParamIdx));
+      CapturingEntity CE{Capturing};
+      // Ensure that 'Captured' outlives the 'Capturing' entity.
+      checkCaptureByLifetime(*this, CE, Captured);
+    }
+  };
+  for (unsigned I = 0; I < FD->getNumParams(); ++I)
+    HandleCaptureByAttr(FD->getParamDecl(I)->getAttr<LifetimeCaptureByAttr>(),
+                        I + IsMemberFunction);
+  // Check when the implicit object param is captured.
+  if (IsMemberFunction) {
+    TypeSourceInfo *TSI = FD->getTypeSourceInfo();
+    if (!TSI)
+      return;
+    AttributedTypeLoc ATL;
+    for (TypeLoc TL = TSI->getTypeLoc();
+         (ATL = TL.getAsAdjusted<AttributedTypeLoc>());
+         TL = ATL.getModifiedLoc())
+      HandleCaptureByAttr(ATL.getAttrAs<LifetimeCaptureByAttr>(), 0);
+  }
+}
+
 void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
                      const Expr *ThisArg, ArrayRef<const Expr *> Args,
                      bool IsMemberFunction, SourceLocation Loc,
@@ -3262,7 +3304,8 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
       }
     }
   }
-
+  if (FD)
+    checkLifetimeCaptureBy(FD, IsMemberFunction, ThisArg, Args);
   if (FDecl || Proto) {
     CheckNonNullArguments(*this, FDecl, Proto, Args, Loc);
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index a36ca61a1bef3..be570f3a1829d 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8350,9 +8350,15 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
     return;
 
   // Only warn about certain kinds of shadowing for class members.
-  if (NewDC && NewDC->isRecord()) {
+  if (NewDC) {
     // In particular, don't warn about shadowing non-class members.
-    if (!OldDC->isRecord())
+    if (NewDC->isRecord() && !OldDC->isRecord())
+      return;
+
+    // Skip shadowing check if we're in a class scope, dealing with an enum
+    // constant in a different context.
+    DeclContext *ReDC = NewDC->getRedeclContext();
+    if (ReDC->isRecord() && isa<EnumConstantDecl>(D) && !OldDC->Equals(ReDC))
       return;
 
     // TODO: should we warn about static data members shadowing
@@ -8363,7 +8369,6 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
     // shadowing context, but that's just a false negative.
   }
 
-
   DeclarationName Name = R.getLookupName();
 
   // Emit warning and note.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 0f5baa1e1eb36..146d9c86e0715 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7368,7 +7368,9 @@ void Sema::ProcessDeclAttributeList(
   // good to have a way to specify "these attributes must appear as a group",
   // for these. Additionally, it would be good to have a way to specify "these
   // attribute must never appear as a group" for attributes like cold and hot.
-  if (!D->hasAttr<OpenCLKernelAttr>()) {
+  if (!(D->hasAttr<OpenCLKernelAttr>() ||
+        (D->hasAttr<CUDAGlobalAttr>() &&
+         Context.getTargetInfo().getTriple().isSPIRV()))) {
     // These attributes cannot be applied to a non-kernel function.
     if (const auto *A = D->getAttr<ReqdWorkGroupSizeAttr>()) {
       // FIXME: This emits a different error message than
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index dcf495b700540..6c7472ce92703 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -13821,7 +13821,7 @@ QualType Sema::CheckAssignmentOperands(Expr *LHSExpr, ExprResult &RHS,
   CheckForNullPointerDereference(*this, LHSExpr);
 
   AssignedEntity AE{LHSExpr};
-  checkExprLifetime(*this, AE, RHS.get());
+  checkAssignmentLifetime(*this, AE, RHS.get());
 
   if (getLangOpts().CPlusPlus20 && LHSType.isVolatileQualified()) {
     if (CompoundType.isNull()) {
diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
index 4b5ddb74b1262..6fe4d2353a228 100644
--- a/clang/lib/Sema/SemaFunctionEffects.cpp
+++ b/clang/lib/Sema/SemaFunctionEffects.cpp
@@ -807,7 +807,8 @@ class Analyzer {
 
     auto MaybeAddTemplateNote = [&](const Decl *D) {
       if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
-        while (FD != nullptr && FD->isTemplateInstantiation()) {
+        while (FD != nullptr && FD->isTemplateInstantiation() &&
+               FD->getPointOfInstantiation().isValid()) {
           S.Diag(FD->getPointOfInstantiation(),
                  diag::note_func_effect_from_template);
           FD = FD->getTemplateInstantiationPattern();
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 1e98a074894ad..7c03a12e81280 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -7401,7 +7401,7 @@ PerformConstructorInitialization(Sema &S,
 
 void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
                                     Expr *Init) {
-  return sema::checkExprLifetime(*this, Entity, Init);
+  return sema::checkInitLifetime(*this, Entity, Init);
 }
 
 static void DiagnoseNarrowingInInitList(Sema &S,
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index e7afa0f4c81fc..a67c0b2b367d1 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -1950,8 +1950,6 @@ ExprResult Sema::ActOnLambdaExpr(SourceLocation StartLoc, Stmt *Body) {
   LambdaScopeInfo LSI = *cast<LambdaScopeInfo>(FunctionScopes.back());
   ActOnFinishFunctionBody(LSI.CallOperator, Body);
 
-  maybeAddDeclWithEffects(LSI.CallOperator);
-
   return BuildLambdaExpr(StartLoc, Body->getEndLoc(), &LSI);
 }
 
@@ -2284,6 +2282,7 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,
     case ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed:
       break;
     }
+    maybeAddDeclWithEffects(LSI->CallOperator);
   }
 
   return MaybeBindToTemporary(Lambda);
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index a239f2c6e88e4..e4bf9aa521224 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14809,7 +14809,7 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
           // Check for a self move.
           DiagnoseSelfMove(Args[0], Args[1], OpLoc);
           // lifetime check.
-          checkExprLifetime(
+          checkAssignmentLifetime(
               *this, AssignedEntity{Args[0], dyn_cast<CXXMethodDecl>(FnDecl)},
               Args[1]);
         }
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index f3ee5211acdd1..d6bc66246c758 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -200,23 +200,30 @@ static bool DiagnoseUnusedComparison(Sema &S, const Expr *E) {
   return true;
 }
 
-static bool DiagnoseNoDiscard(Sema &S, const WarnUnusedResultAttr *A,
-                              SourceLocation Loc, SourceRange R1,
-                              SourceRange R2, bool IsCtor) {
+static bool DiagnoseNoDiscard(Sema &S, const NamedDecl *OffendingDecl,
+                              const WarnUnusedResultAttr *A, SourceLocation Loc,
+                              SourceRange R1, SourceRange R2, bool IsCtor) {
   if (!A)
     return false;
   StringRef Msg = A->getMessage();
 
   if (Msg.empty()) {
+    if (OffendingDecl)
+      return S.Diag(Loc, diag::warn_unused_return_type)
+             << IsCtor << A << OffendingDecl << false << R1 << R2;
     if (IsCtor)
-      return S.Diag(Loc, diag::warn_unused_constructor) << A << R1 << R2;
-    return S.Diag(Loc, diag::warn_unused_result) << A << R1 << R2;
+      return S.Diag(Loc, diag::warn_unused_constructor)
+             << A << false << R1 << R2;
+    return S.Diag(Loc, diag::warn_unused_result) << A << false << R1 << R2;
   }
 
+  if (OffendingDecl)
+    return S.Diag(Loc, diag::warn_unused_return_type)
+           << IsCtor << A << OffendingDecl << true << Msg << R1 << R2;
   if (IsCtor)
-    return S.Diag(Loc, diag::warn_unused_constructor_msg) << A << Msg << R1
-                                                          << R2;
-  return S.Diag(Loc, diag::warn_unused_result_msg) << A << Msg << R1 << R2;
+    return S.Diag(Loc, diag::warn_unused_constructor)
+           << A << true << Msg << R1 << R2;
+  return S.Diag(Loc, diag::warn_unused_result) << A << true << Msg << R1 << R2;
 }
 
 void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
@@ -286,9 +293,10 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
     if (E->getType()->isVoidType())
       return;
 
-    if (DiagnoseNoDiscard(*this, cast_or_null<WarnUnusedResultAttr>(
-                                     CE->getUnusedResultAttr(Context)),
-                          Loc, R1, R2, /*isCtor=*/false))
+    auto [OffendingDecl, A] = CE->getUnusedResultAttr(Context);
+    if (DiagnoseNoDiscard(*this, OffendingDecl,
+                          cast_or_null<WarnUnusedResultAttr>(A), Loc, R1, R2,
+                          /*isCtor=*/false))
       return;
 
     // If the callee has attribute pure, const, or warn_unused_result, warn with
@@ -309,16 +317,21 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
     }
   } else if (const auto *CE = dyn_cast<CXXConstructExpr>(E)) {
     if (const CXXConstructorDecl *Ctor = CE->getConstructor()) {
+      const NamedDecl *OffendingDecl = nullptr;
       const auto *A = Ctor->getAttr<WarnUnusedResultAttr>();
-      A = A ? A : Ctor->getParent()->getAttr<WarnUnusedResultAttr>();
-      if (DiagnoseNoDiscard(*this, A, Loc, R1, R2, /*isCtor=*/true))
+      if (!A) {
+        OffendingDecl = Ctor->getParent();
+        A = OffendingDecl->getAttr<WarnUnusedResultAttr>();
+      }
+      if (DiagnoseNoDiscard(*this, OffendingDecl, A, Loc, R1, R2,
+                            /*isCtor=*/true))
         return;
     }
   } else if (const auto *ILE = dyn_cast<InitListExpr>(E)) {
     if (const TagDecl *TD = ILE->getType()->getAsTagDecl()) {
 
-      if (DiagnoseNoDiscard(*this, TD->getAttr<WarnUnusedResultAttr>(), Loc, R1,
-                            R2, /*isCtor=*/false))
+      if (DiagnoseNoDiscard(*this, TD, TD->getAttr<WarnUnusedResultAttr>(), Loc,
+                            R1, R2, /*isCtor=*/false))
         return;
     }
   } else if (ShouldSuppress)
@@ -332,8 +345,8 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) {
     }
     const ObjCMethodDecl *MD = ME->getMethodDecl();
     if (MD) {
-      if (DiagnoseNoDiscard(*this, MD->getAttr<WarnUnusedResultAttr>(), Loc, R1,
-                            R2, /*isCtor=*/false))
+      if (DiagnoseNoDiscard(*this, nullptr, MD->getAttr<WarnUnusedResultAttr>(),
+                            Loc, R1, R2, /*isCtor=*/false))
         return;
     }
   } else if (const PseudoObjectExpr *POE = dyn_cast<PseudoObjectExpr>(E)) {
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 76e3fcc124178..f32edc5ac0644 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4882,9 +4882,17 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
                       cast<AutoType>(T)->getKeyword() !=
                           AutoTypeKeyword::Auto ||
                       cast<AutoType>(T)->isConstrained())) {
-            S.Diag(D.getDeclSpec().getTypeSpecTypeLoc(),
-                   diag::err_trailing_return_without_auto)
-                << T << D.getDeclSpec().getSourceRange();
+            // Attach a valid source location for diagnostics on functions with
+            // trailing return types missing 'auto'. Attempt to get the location
+            // from the declared type; if invalid, fall back to the trailing
+            // return type's location.
+            SourceLocation Loc = D.getDeclSpec().getTypeSpecTypeLoc();
+            SourceRange SR = D.getDeclSpec().getSourceRange();
+            if (Loc.isInvalid()) {
+              Loc = FTI.getTrailingReturnTypeLoc();
+              SR = D.getSourceRange();
+            }
+            S.Diag(Loc, diag::err_trailing_return_without_auto) << T << SR;
             D.setInvalidType(true);
             // FIXME: recover and fill decls in `TypeLoc`s.
             AreDeclaratorChunksValid = false;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 8b928ede395ae..ec85fad3389a1 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -3092,98 +3092,97 @@ ASTReader::ReadControlBlock(ModuleFile &F,
       break;
     }
 
-    case IMPORTS: {
+    case IMPORT: {
       // Validate the AST before processing any imports (otherwise, untangling
       // them can be error-prone and expensive).  A module will have a name and
       // will already have been validated, but this catches the PCH case.
       if (ASTReadResult Result = readUnhashedControlBlockOnce())
         return Result;
 
-      // Load each of the imported PCH files.
-      unsigned Idx = 0, N = Record.size();
-      while (Idx < N) {
-        // Read information about the AST file.
-        ModuleKind ImportedKind = (ModuleKind)Record[Idx++];
-        // Whether we're importing a standard c++ module.
-        bool IsImportingStdCXXModule = Record[Idx++];
-        // The import location will be the local one for now; we will adjust
-        // all import locations of module imports after the global source
-        // location info are setup, in ReadAST.
-        auto [ImportLoc, ImportModuleFileIndex] =
-            ReadUntranslatedSourceLocation(Record[Idx++]);
-        // The import location must belong to the current module file itself.
-        assert(ImportModuleFileIndex == 0);
-        off_t StoredSize = !IsImportingStdCXXModule ? (off_t)Record[Idx++] : 0;
-        time_t StoredModTime =
-            !IsImportingStdCXXModule ? (time_t)Record[Idx++] : 0;
-
-        ASTFileSignature StoredSignature;
-        if (!IsImportingStdCXXModule) {
-          auto FirstSignatureByte = Record.begin() + Idx;
-          StoredSignature = ASTFileSignature::create(
-              FirstSignatureByte, FirstSignatureByte + ASTFileSignature::size);
-          Idx += ASTFileSignature::size;
-        }
+      unsigned Idx = 0;
+      // Read information about the AST file.
+      ModuleKind ImportedKind = (ModuleKind)Record[Idx++];
+
+      // The import location will be the local one for now; we will adjust
+      // all import locations of module imports after the global source
+      // location info are setup, in ReadAST.
+      auto [ImportLoc, ImportModuleFileIndex] =
+          ReadUntranslatedSourceLocation(Record[Idx++]);
+      // The import location must belong to the current module file itself.
+      assert(ImportModuleFileIndex == 0);
+
+      StringRef ImportedName = ReadStringBlob(Record, Idx, Blob);
+
+      bool IsImportingStdCXXModule = Record[Idx++];
+
+      off_t StoredSize = 0;
+      time_t StoredModTime = 0;
+      ASTFileSignature StoredSignature;
+      std::string ImportedFile;
+
+      // For prebuilt and explicit modules first consult the file map for
+      // an override. Note that here we don't search prebuilt module
+      // directories if we're not importing standard c++ module, only the
+      // explicit name to file mappings. Also, we will still verify the
+      // size/signature making sure it is essentially the same file but
+      // perhaps in a different location.
+      if (ImportedKind == MK_PrebuiltModule || ImportedKind == MK_ExplicitModule)
+        ImportedFile = PP.getHeaderSearchInfo().getPrebuiltModuleFileName(
+            ImportedName, /*FileMapOnly*/ !IsImportingStdCXXModule);
+
+      if (IsImportingStdCXXModule && ImportedFile.empty()) {
+        Diag(diag::err_failed_to_find_module_file) << ImportedName;
+        return Missing;
+      }
 
-        std::string ImportedName = ReadString(Record, Idx);
-        std::string ImportedFile;
-
-        // For prebuilt and explicit modules first consult the file map for
-        // an override. Note that here we don't search prebuilt module
-        // directories if we're not importing standard c++ module, only the
-        // explicit name to file mappings. Also, we will still verify the
-        // size/signature making sure it is essentially the same file but
-        // perhaps in a different location.
-        if (ImportedKind == MK_PrebuiltModule || ImportedKind == MK_ExplicitModule)
-          ImportedFile = PP.getHeaderSearchInfo().getPrebuiltModuleFileName(
-              ImportedName, /*FileMapOnly*/ !IsImportingStdCXXModule);
-
-        // For C++20 Modules, we won't record the path to the imported modules
-        // in the BMI
-        if (!IsImportingStdCXXModule) {
-          if (ImportedFile.empty()) {
-            // Use BaseDirectoryAsWritten to ensure we use the same path in the
-            // ModuleCache as when writing.
-            ImportedFile = ReadPath(BaseDirectoryAsWritten, Record, Idx);
-          } else
-            SkipPath(Record, Idx);
-        } else if (ImportedFile.empty()) {
-          Diag(clang::diag::err_failed_to_find_module_file) << ImportedName;
-          return Missing;
-        }
+      if (!IsImportingStdCXXModule) {
+        StoredSize = (off_t)Record[Idx++];
+        StoredModTime = (time_t)Record[Idx++];
 
-        // If our client can't cope with us being out of date, we can't cope with
-        // our dependency being missing.
-        unsigned Capabilities = ClientLoadCapabilities;
-        if ((ClientLoadCapabilities & ARR_OutOfDate) == 0)
-          Capabilities &= ~ARR_Missing;
-
-        // Load the AST file.
-        auto Result = ReadASTCore(ImportedFile, ImportedKind, ImportLoc, &F,
-                                  Loaded, StoredSize, StoredModTime,
-                                  StoredSignature, Capabilities);
-
-        // If we diagnosed a problem, produce a backtrace.
-        bool recompilingFinalized =
-            Result == OutOfDate && (Capabilities & ARR_OutOfDate) &&
-            getModuleManager().getModuleCache().isPCMFinal(F.FileName);
-        if (isDiagnosedResult(Result, Capabilities) || recompilingFinalized)
-          Diag(diag::note_module_file_imported_by)
-              << F.FileName << !F.ModuleName.empty() << F.ModuleName;
-        if (recompilingFinalized)
-          Diag(diag::note_module_file_conflict);
-
-        switch (Result) {
-        case Failure: return Failure;
-          // If we have to ignore the dependency, we'll have to ignore this too.
-        case Missing:
-        case OutOfDate: return OutOfDate;
-        case VersionMismatch: return VersionMismatch;
-        case ConfigurationMismatch: return ConfigurationMismatch;
-        case HadErrors: return HadErrors;
-        case Success: break;
+        StringRef SignatureBytes = Blob.substr(0, ASTFileSignature::size);
+        StoredSignature = ASTFileSignature::create(SignatureBytes.begin(),
+                                                   SignatureBytes.end());
+        Blob = Blob.substr(ASTFileSignature::size);
+
+        if (ImportedFile.empty()) {
+          // Use BaseDirectoryAsWritten to ensure we use the same path in the
+          // ModuleCache as when writing.
+          ImportedFile =
+              ReadPathBlob(BaseDirectoryAsWritten, Record, Idx, Blob);
         }
       }
+
+      // If our client can't cope with us being out of date, we can't cope with
+      // our dependency being missing.
+      unsigned Capabilities = ClientLoadCapabilities;
+      if ((ClientLoadCapabilities & ARR_OutOfDate) == 0)
+        Capabilities &= ~ARR_Missing;
+
+      // Load the AST file.
+      auto Result = ReadASTCore(ImportedFile, ImportedKind, ImportLoc, &F,
+                                Loaded, StoredSize, StoredModTime,
+                                StoredSignature, Capabilities);
+
+      // If we diagnosed a problem, produce a backtrace.
+      bool recompilingFinalized =
+          Result == OutOfDate && (Capabilities & ARR_OutOfDate) &&
+          getModuleManager().getModuleCache().isPCMFinal(F.FileName);
+      if (isDiagnosedResult(Result, Capabilities) || recompilingFinalized)
+        Diag(diag::note_module_file_imported_by)
+            << F.FileName << !F.ModuleName.empty() << F.ModuleName;
+      if (recompilingFinalized)
+        Diag(diag::note_module_file_conflict);
+
+      switch (Result) {
+      case Failure: return Failure;
+        // If we have to ignore the dependency, we'll have to ignore this too.
+      case Missing:
+      case OutOfDate: return OutOfDate;
+      case VersionMismatch: return VersionMismatch;
+      case ConfigurationMismatch: return ConfigurationMismatch;
+      case HadErrors: return HadErrors;
+      case Success: break;
+      }
       break;
     }
 
@@ -5624,36 +5623,38 @@ bool ASTReader::readASTFileControlBlock(
       break;
     }
 
-    case IMPORTS: {
+    case IMPORT: {
       if (!NeedsImports)
         break;
 
-      unsigned Idx = 0, N = Record.size();
-      while (Idx < N) {
-        // Read information about the AST file.
+      unsigned Idx = 0;
+      // Read information about the AST file.
+
+      // Skip Kind
+      Idx++;
 
-        // Skip Kind
-        Idx++;
-        bool IsStandardCXXModule = Record[Idx++];
+      // Skip ImportLoc
+      Idx++;
 
-        // Skip ImportLoc
-        Idx++;
+      StringRef ModuleName = ReadStringBlob(Record, Idx, Blob);
 
-        // In C++20 Modules, we don't record the path to imported
-        // modules in the BMI files.
-        if (IsStandardCXXModule) {
-          std::string ModuleName = ReadString(Record, Idx);
-          Listener.visitImport(ModuleName, /*Filename=*/"");
-          continue;
-        }
+      bool IsStandardCXXModule = Record[Idx++];
 
-        // Skip Size, ModTime and Signature
-        Idx += 1 + 1 + ASTFileSignature::size;
-        std::string ModuleName = ReadString(Record, Idx);
-        std::string FilenameStr = ReadString(Record, Idx);
-        auto Filename = ResolveImportedPath(PathBuf, FilenameStr, ModuleDir);
-        Listener.visitImport(ModuleName, *Filename);
+      // In C++20 Modules, we don't record the path to imported
+      // modules in the BMI files.
+      if (IsStandardCXXModule) {
+        Listener.visitImport(ModuleName, /*Filename=*/"");
+        continue;
       }
+
+      // Skip Size and ModTime.
+      Idx += 1 + 1;
+      // Skip signature.
+      Blob = Blob.substr(ASTFileSignature::size);
+
+      StringRef FilenameStr = ReadStringBlob(Record, Idx, Blob);
+      auto Filename = ResolveImportedPath(PathBuf, FilenameStr, ModuleDir);
+      Listener.visitImport(ModuleName, *Filename);
       break;
     }
 
@@ -9602,6 +9603,14 @@ std::string ASTReader::ReadString(const RecordDataImpl &Record, unsigned &Idx) {
   return Result;
 }
 
+StringRef ASTReader::ReadStringBlob(const RecordDataImpl &Record, unsigned &Idx,
+                                    StringRef &Blob) {
+  const unsigned Length = Record[Idx++]; // Only the length is in Record.
+  const StringRef Str = Blob.take_front(Length);
+  Blob = Blob.drop_front(Str.size());
+  return Str;
+}
+
 std::string ASTReader::ReadPath(ModuleFile &F, const RecordData &Record,
                                 unsigned &Idx) {
   return ReadPath(F.BaseDirectory, Record, Idx);
@@ -9613,6 +9622,13 @@ std::string ASTReader::ReadPath(StringRef BaseDirectory,
   return ResolveImportedPathAndAllocate(PathBuf, Filename, BaseDirectory);
 }
 
+std::string ASTReader::ReadPathBlob(StringRef BaseDirectory,
+                                    const RecordData &Record, unsigned &Idx,
+                                    StringRef &Blob) {
+  StringRef Filename = ReadStringBlob(Record, Idx, Blob); // Advances Idx/Blob.
+  return ResolveImportedPathAndAllocate(PathBuf, Filename, BaseDirectory);
+}
+
 VersionTuple ASTReader::ReadVersionTuple(const RecordData &Record,
                                          unsigned &Idx) {
   unsigned Major = Record[Idx++];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 88b3e649a5d46..a52d59c61c4ce 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -878,7 +878,7 @@ void ASTWriter::WriteBlockInfoBlock() {
   RECORD(MODULE_NAME);
   RECORD(MODULE_DIRECTORY);
   RECORD(MODULE_MAP_FILE);
-  RECORD(IMPORTS);
+  RECORD(IMPORT);
   RECORD(ORIGINAL_FILE);
   RECORD(ORIGINAL_FILE_ID);
   RECORD(INPUT_FILE_OFFSETS);
@@ -1536,34 +1536,53 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, StringRef isysroot) {
 
   // Imports
   if (Chain) {
-    serialization::ModuleManager &Mgr = Chain->getModuleManager();
-    Record.clear();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(IMPORT));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Kind
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ImportLoc
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Module name len
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Standard C++ mod
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File size
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File timestamp
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File name len
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Strings
+    unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
 
-    for (ModuleFile &M : Mgr) {
+    SmallString<128> Blob;
+
+    for (ModuleFile &M : Chain->getModuleManager()) {
       // Skip modules that weren't directly imported.
       if (!M.isDirectlyImported())
         continue;
 
+      Record.clear();
+      Blob.clear();
+
+      Record.push_back(IMPORT);
       Record.push_back((unsigned)M.Kind); // FIXME: Stable encoding
-      Record.push_back(M.StandardCXXModule);
       AddSourceLocation(M.ImportLoc, Record);
+      AddStringBlob(M.ModuleName, Record, Blob);
+      Record.push_back(M.StandardCXXModule);
 
       // We don't want to hard code the information about imported modules
       // in the C++20 named modules.
-      if (!M.StandardCXXModule) {
+      if (M.StandardCXXModule) {
+        Record.push_back(0);
+        Record.push_back(0);
+        Record.push_back(0);
+      } else {
         // If we have calculated signature, there is no need to store
         // the size or timestamp.
         Record.push_back(M.Signature ? 0 : M.File.getSize());
         Record.push_back(M.Signature ? 0 : getTimestampForOutput(M.File));
-        llvm::append_range(Record, M.Signature);
-      }
 
-      AddString(M.ModuleName, Record);
+        llvm::append_range(Blob, M.Signature);
 
-      if (!M.StandardCXXModule)
-        AddPath(M.FileName, Record);
+        AddPathBlob(M.FileName, Record, Blob);
+      }
+
+      Stream.EmitRecordWithBlob(AbbrevCode, Record, Blob);
     }
-    Stream.EmitRecord(IMPORTS, Record);
   }
 
   // Write the options block.
@@ -4777,6 +4796,12 @@ void ASTWriter::AddString(StringRef Str, RecordDataImpl &Record) {
   Record.insert(Record.end(), Str.begin(), Str.end());
 }
 
+void ASTWriter::AddStringBlob(StringRef Str, RecordDataImpl &Record,
+                              SmallVectorImpl<char> &Blob) {
+  Record.push_back(Str.size()); // Length lives in the record itself...
+  Blob.append(Str.begin(), Str.end()); // ...characters go to the blob.
+}
+
 bool ASTWriter::PreparePathForOutput(SmallVectorImpl<char> &Path) {
   assert(WritingAST && "can't prepare path for output when not writing AST");
 
@@ -4805,6 +4830,13 @@ void ASTWriter::AddPath(StringRef Path, RecordDataImpl &Record) {
   AddString(FilePath, Record);
 }
 
+void ASTWriter::AddPathBlob(StringRef Path, RecordDataImpl &Record,
+                            SmallVectorImpl<char> &Blob) {
+  SmallString<128> FilePath(Path); // Mutable copy for PreparePathForOutput.
+  PreparePathForOutput(FilePath);
+  AddStringBlob(FilePath, Record, Blob);
+}
+
 void ASTWriter::EmitRecordWithPath(unsigned Abbrev, RecordDataRef Record,
                                    StringRef Path) {
   SmallString<128> FilePath(Path);
diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp
index 9c48712a0b3fb..4b920fccecac3 100644
--- a/clang/lib/Serialization/GlobalModuleIndex.cpp
+++ b/clang/lib/Serialization/GlobalModuleIndex.cpp
@@ -614,62 +614,58 @@ llvm::Error GlobalModuleIndexBuilder::loadModuleFile(FileEntryRef File) {
     unsigned Code = MaybeCode.get();
 
     // Handle module dependencies.
-    if (State == ControlBlock && Code == IMPORTS) {
-      // Load each of the imported PCH files.
-      unsigned Idx = 0, N = Record.size();
-      while (Idx < N) {
-        // Read information about the AST file.
-
-        // Skip the imported kind
-        ++Idx;
-
-        // Skip if it is standard C++ module
-        ++Idx;
-
-        // Skip the import location
-        ++Idx;
-
-        // Load stored size/modification time.
-        off_t StoredSize = (off_t)Record[Idx++];
-        time_t StoredModTime = (time_t)Record[Idx++];
-
-        // Skip the stored signature.
-        // FIXME: we could read the signature out of the import and validate it.
-        auto FirstSignatureByte = Record.begin() + Idx;
-        ASTFileSignature StoredSignature = ASTFileSignature::create(
-            FirstSignatureByte, FirstSignatureByte + ASTFileSignature::size);
-        Idx += ASTFileSignature::size;
-
-        // Skip the module name (currently this is only used for prebuilt
-        // modules while here we are only dealing with cached).
-        Idx += Record[Idx] + 1;
-
-        // Retrieve the imported file name.
-        unsigned Length = Record[Idx++];
-        SmallString<128> ImportedFile(Record.begin() + Idx,
-                                      Record.begin() + Idx + Length);
-        Idx += Length;
-
-        // Find the imported module file.
-        auto DependsOnFile =
-            FileMgr.getOptionalFileRef(ImportedFile, /*OpenFile=*/false,
-                                       /*CacheFailure=*/false);
-
-        if (!DependsOnFile)
-          return llvm::createStringError(std::errc::bad_file_descriptor,
-                                         "imported file \"%s\" not found",
-                                         ImportedFile.c_str());
-
-        // Save the information in ImportedModuleFileInfo so we can verify after
-        // loading all pcms.
-        ImportedModuleFiles.insert(std::make_pair(
-            *DependsOnFile, ImportedModuleFileInfo(StoredSize, StoredModTime,
-                                                   StoredSignature)));
-
-        // Record the dependency.
-        unsigned DependsOnID = getModuleFileInfo(*DependsOnFile).ID;
-        getModuleFileInfo(File).Dependencies.push_back(DependsOnID);
-      }
+    if (State == ControlBlock && Code == IMPORT) {
+      unsigned Idx = 0;
+      // Read information about the AST file.
+
+      // Skip the imported kind
+      ++Idx;
+
+      // Skip the import location
+      ++Idx;
+
+      // Skip the module name (currently this is only used for prebuilt
+      // modules while here we are only dealing with cached).
+      Blob = Blob.substr(Record[Idx++]);
+
+      // Skip the flag saying whether this is a standard C++ module.
+      ++Idx;
+
+      // Load stored size/modification time.
+      off_t StoredSize = (off_t)Record[Idx++];
+      time_t StoredModTime = (time_t)Record[Idx++];
+
+      // Read the stored signature from the front of the remaining blob.
+      // FIXME: we could read the signature out of the import and validate it.
+      StringRef SignatureBytes = Blob.substr(0, ASTFileSignature::size);
+      auto StoredSignature = ASTFileSignature::create(SignatureBytes.begin(),
+                                                      SignatureBytes.end());
+      Blob = Blob.substr(ASTFileSignature::size);
+
+      // Retrieve the imported file name.
+      unsigned Length = Record[Idx++];
+      StringRef ImportedFile = Blob.substr(0, Length);
+      Blob = Blob.substr(Length);
+
+      // Find the imported module file.
+      auto DependsOnFile =
+          FileMgr.getOptionalFileRef(ImportedFile, /*OpenFile=*/false,
+                                     /*CacheFailure=*/false);
+
+      if (!DependsOnFile)
+        return llvm::createStringError(std::errc::bad_file_descriptor,
+                                       "imported file \"%s\" not found",
+                                       std::string(ImportedFile).c_str());
+
+      // Save the information in ImportedModuleFileInfo so we can verify after
+      // loading all pcms.
+      ImportedModuleFiles.insert(std::make_pair(
+          *DependsOnFile, ImportedModuleFileInfo(StoredSize, StoredModTime,
+                                                 StoredSignature)));
+
+      // Record the dependency.
+      unsigned DependsOnID = getModuleFileInfo(*DependsOnFile).ID;
+      getModuleFileInfo(File).Dependencies.push_back(DependsOnID);
 
       continue;
     }
diff --git a/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl
index 5a13ca7735f99..8c951e9829211 100644
--- a/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl
@@ -30,20 +30,20 @@ AppendStructuredBuffer<int> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class AppendStructuredBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
 
-// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &(unsigned int)'
 
 // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class AppendStructuredBuffer definition
 // CHECK: TemplateArgument type 'int'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
diff --git a/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl
index b75f3fcb959cf..86e3d387883dc 100644
--- a/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl
@@ -30,13 +30,13 @@ ConsumeStructuredBuffer<int> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class ConsumeStructuredBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
 
-// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &(unsigned int)'
 
 // CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class ConsumeStructuredBuffer definition
@@ -44,7 +44,7 @@ ConsumeStructuredBuffer<int> Buffer;
 // CHECK: TemplateArgument type 'int'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
index ebddd72ddb1e0..f2eba75481fd5 100644
--- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
@@ -29,12 +29,12 @@ RWBuffer<float> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class RWBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
 
-// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
 // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
 // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
@@ -55,7 +55,7 @@ RWBuffer<float> Buffer;
 // CHECK: TemplateArgument type 'float'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] 
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
diff --git a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
index 4a1e1d7570e5e..cc10b41b7c2b0 100644
--- a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
@@ -30,13 +30,13 @@ RWStructuredBuffer<int> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class RWStructuredBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
 
-// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
 // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
 // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
@@ -57,7 +57,7 @@ RWStructuredBuffer<int> Buffer;
 // CHECK: TemplateArgument type 'int'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
diff --git a/clang/test/AST/HLSL/RasterizerOrderedStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/RasterizerOrderedStructuredBuffer-AST.hlsl
index f334e1bb6db3f..1aac67b5ced5b 100644
--- a/clang/test/AST/HLSL/RasterizerOrderedStructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RasterizerOrderedStructuredBuffer-AST.hlsl
@@ -30,14 +30,14 @@ RasterizerOrderedStructuredBuffer<int> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class RasterizerOrderedStructuredBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
-// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
 
-// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
 // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
 // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
@@ -58,9 +58,9 @@ RasterizerOrderedStructuredBuffer<int> Buffer;
 // CHECK: TemplateArgument type 'int'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
-// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
index 521c3d45b2022..95ae20ead32bf 100644
--- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
@@ -30,13 +30,13 @@ StructuredBuffer<float> Buffer;
 // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class StructuredBuffer definition
 
 // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]
 // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit RawBuffer
 
-// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'const element_type &(unsigned int) const'
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
 // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
 // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
@@ -57,7 +57,7 @@ StructuredBuffer<float> Buffer;
 // CHECK: TemplateArgument type 'float'
 // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
 // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
index 693ca29370cf3..da1f8201f55dc 100644
--- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
@@ -17,10 +17,10 @@ E get_e();
 // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}}
 
 void f() {
-  get_s(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_s(); // expected-warning {{ignoring return value of type 'S' declared with 'nodiscard' attribute}}
   get_i(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
   get_vi(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  get_e(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_e(); // expected-warning {{ignoring return value of type 'E' declared with 'nodiscard' attribute}}
 
   // Okay, warnings are not encouraged
   get_s_ref();
@@ -54,10 +54,10 @@ void f() {
   fp3 three;
   fp2_alias four;
 
-  one(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  two(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  three(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  four(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  one(); // expected-warning {{ignoring return value of type 'E' declared with 'nodiscard' attribute}}
+  two(); // expected-warning {{ignoring return value of type 'S' declared with 'nodiscard' attribute}}
+  three(); // expected-warning {{ignoring return value of type 'S' declared with 'nodiscard' attribute}}
+  four(); // expected-warning {{ignoring return value of type 'S' declared with 'nodiscard' attribute}}
 
   // These are all okay because of the explicit cast to void.
   (void)one();
@@ -84,8 +84,8 @@ LaterReason get_later_reason();
 // cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}}
 
 void cxx20_use() {
-  get_reason(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: reason}}
-  get_later_reason(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: later reason}}
+  get_reason(); // expected-warning {{ignoring return value of type 'ReasonStruct' declared with 'nodiscard' attribute: reason}}
+  get_later_reason(); // expected-warning {{ignoring return value of type 'LaterReason' declared with 'nodiscard' attribute: later reason}}
   another_reason(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: another reason}}
   conflicting_reason(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: special reason}}
 }
@@ -115,20 +115,20 @@ void usage() {
   S('A'); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't let that S-Char go!}}
   S(1);
   S(2.2);
-  Y(); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away either!}}
+  Y(); // expected-warning {{ignoring temporary of type 'Y' declared with 'nodiscard' attribute: Don't throw me away either!}}
   S s;
-  ConvertTo{}; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}}
+  ConvertTo{}; // expected-warning {{ignoring return value of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me away!}}
 
   // AST is different in C++17 mode. Before, a move ctor for ConvertTo is there
   // as well, hence the constructor warning.
 
-  // since-cxx17-warning@+2 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}}
-  // cxx11-warning@+1 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}}
+  // since-cxx17-warning@+2 {{ignoring return value of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me away!}}
+  // cxx11-warning@+1 {{ignoring temporary of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me away!}}
   (ConvertTo) s;
   (int)s; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
   (S)'c'; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't let that S-Char go!}}
-  // since-cxx17-warning@+2 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}}
-  // cxx11-warning@+1 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}}
+  // since-cxx17-warning@+2 {{ignoring return value of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me away!}}
+  // cxx11-warning@+1 {{ignoring temporary of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me away!}}
   static_cast<ConvertTo>(s);
   static_cast<int>(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
   static_cast<double>(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw away as a double}}
diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
index a3543cff7d2c9..b37517921b1ca 100644
--- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
@@ -8,7 +8,7 @@ namespace std_example {
   error_info enable_missile_safety_mode();
   void launch_missiles();
   void test_missiles() {
-    enable_missile_safety_mode(); // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+    enable_missile_safety_mode(); // expected-warning {{ignoring return value of type 'error_info' declared with 'nodiscard'}}
     launch_missiles();
   }
 
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p2-cxx0x.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p2-cxx0x.cpp
index ce90728861605..881742df7e8b2 100644
--- a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p2-cxx0x.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p2-cxx0x.cpp
@@ -1,7 +1,18 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s 
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s
+// RUN: not %clang_cc1 -fsyntax-only -std=c++11 -fno-diagnostics-show-line-numbers -fcaret-diagnostics-max-lines=1 %s 2>&1 | FileCheck %s -strict-whitespace
 
 auto a() -> int; // ok
 const auto b() -> int; // expected-error {{function with trailing return type must specify return type 'auto', not 'const auto'}}
 auto *c() -> int; // expected-error {{function with trailing return type must specify return type 'auto', not 'auto *'}}
 auto (d() -> int); // expected-error {{trailing return type may not be nested within parentheses}}
 auto e() -> auto (*)() -> auto (*)() -> void; // ok: same as void (*(*e())())();
+
+namespace GH78694 {
+
+template <typename T> struct B {
+  // CHECK:      error: function with trailing return type must specify return type 'auto', not 'void'
+  // CHECK-NEXT: {{^}}  template <class U> B(U) -> B<int>;
+  // CHECK-NEXT: {{^}}                     ~~~~~~~~^~~~~~{{$}}
+  template <class U> B(U) -> B<int>; // expected-error {{function with trailing return type must specify return type 'auto', not 'void'}}
+};
+}
diff --git a/clang/test/CodeGen/AArch64/elf-pauthabi.c b/clang/test/CodeGen/AArch64/elf-pauthabi.c
index 023fa8c18e130..b176f708db85b 100644
--- a/clang/test/CodeGen/AArch64/elf-pauthabi.c
+++ b/clang/test/CodeGen/AArch64/elf-pauthabi.c
@@ -1,5 +1,3 @@
-//// TODO: also test with -fptrauth-elf-got when the driver flag is supported
-
 // RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
 // RUN:   -fptrauth-intrinsics \
 // RUN:   -fptrauth-calls \
@@ -9,6 +7,7 @@
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-init-fini \
 // RUN:   -fptrauth-init-fini-address-discrimination \
+// RUN:   -fptrauth-elf-got \
 // RUN:   -fptrauth-indirect-gotos \
 // RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
 // RUN:   -fptrauth-function-pointer-type-discrimination %s | \
@@ -42,6 +41,9 @@
 // RUN:   -fptrauth-calls -fptrauth-init-fini -fptrauth-init-fini-address-discrimination %s | \
 // RUN:   FileCheck %s --check-prefix=INITFINIADDR
 
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-elf-got %s | FileCheck %s --check-prefix=ELFGOT
+
 // RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
 // RUN:   -fptrauth-indirect-gotos %s | FileCheck %s --check-prefix=GOTOS
 
@@ -54,7 +56,7 @@
 // RUN:   FileCheck %s --check-prefix=FPTRTYPE
 
 // ALL: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
-// ALL: !{i32 1, !"aarch64-elf-pauthabi-version", i32 3839}
+// ALL: !{i32 1, !"aarch64-elf-pauthabi-version", i32 4095}
 
 // INTRIN: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
 // INTRIN: !{i32 1, !"aarch64-elf-pauthabi-version", i32 1}
@@ -80,6 +82,9 @@
 // INITFINIADDR: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
 // INITFINIADDR: !{i32 1, !"aarch64-elf-pauthabi-version", i32 194}
 
+// ELFGOT: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// ELFGOT: !{i32 1, !"aarch64-elf-pauthabi-version", i32 256}
+
 // GOTOS: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
 // GOTOS: !{i32 1, !"aarch64-elf-pauthabi-version", i32 512}
 
diff --git a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c
index 3d8ff3985cb0f..d0ec21209582e 100644
--- a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c
+++ b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-inline.c
@@ -3,7 +3,7 @@
 
 inline int func2(int i);
 int external_call2(int i) {
-  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}}
+  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}}
   [[clang::musttail]] return func2(i);
 }
 
diff --git a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c
index 4314bbdd30619..57226d2109f32 100644
--- a/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c
+++ b/clang/test/CodeGen/PowerPC/musttail-forward-declaration-weak.c
@@ -3,7 +3,7 @@
 
 int func2(int i);
 int external_call2(int i) {
-  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}}
+  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}}
   [[clang::musttail]] return func2(i);
 }
 
diff --git a/clang/test/CodeGen/PowerPC/musttail-indirect.cpp b/clang/test/CodeGen/PowerPC/musttail-indirect.cpp
index 3f495002606d4..cc506d4f7bc1f 100644
--- a/clang/test/CodeGen/PowerPC/musttail-indirect.cpp
+++ b/clang/test/CodeGen/PowerPC/musttail-indirect.cpp
@@ -3,6 +3,6 @@
 
 void name(int *params) {
   auto fn = (void (*)(int *))1;
-  // expected-error@+1 {{'musttail' attribute for this call is impossible because indirect calls can not be tail called on PPC}}
+  // expected-error@+1 {{'musttail' attribute for this call is impossible because indirect calls cannot be tail called on PPC}}
   [[clang::musttail]] return fn(params);
 }
diff --git a/clang/test/CodeGen/PowerPC/musttail-inline.c b/clang/test/CodeGen/PowerPC/musttail-inline.c
index 05aac88697127..1ac841f088cf5 100644
--- a/clang/test/CodeGen/PowerPC/musttail-inline.c
+++ b/clang/test/CodeGen/PowerPC/musttail-inline.c
@@ -7,6 +7,6 @@ inline int foo(int x) {
 
 int bar(int x)
 {
-  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}}
+  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}}
   [[clang::musttail]] return foo(1);
 }
diff --git a/clang/test/CodeGen/PowerPC/musttail-undefined.c b/clang/test/CodeGen/PowerPC/musttail-undefined.c
index f2259adb01848..fb3845218a622 100644
--- a/clang/test/CodeGen/PowerPC/musttail-undefined.c
+++ b/clang/test/CodeGen/PowerPC/musttail-undefined.c
@@ -5,6 +5,6 @@ int foo(int x);
 
 int bar(int x)
 {
-  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}}
+  // expected-error@+1 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}}
   [[clang::musttail]] return foo(x);
 }
diff --git a/clang/test/CodeGen/PowerPC/musttail-weak.c b/clang/test/CodeGen/PowerPC/musttail-weak.c
index dccc7a4d8cdd2..1070b91bc5f35 100644
--- a/clang/test/CodeGen/PowerPC/musttail-weak.c
+++ b/clang/test/CodeGen/PowerPC/musttail-weak.c
@@ -7,7 +7,7 @@ __attribute__((weak)) int func2(int i) {
   return 0;
 }
 int external_call2(int i) {
-  // linux-error@+2 {{'musttail' attribute for this call is impossible because external calls can not be tail called on PPC}}
+  // linux-error@+2 {{'musttail' attribute for this call is impossible because external calls cannot be tail called on PPC}}
   // aix-error@+1 {{'musttail' attribute is not supported on AIX}}
   [[clang::musttail]] return func2(i);
 }
diff --git a/clang/test/CodeGen/PowerPC/musttail.c b/clang/test/CodeGen/PowerPC/musttail.c
index e3129263d2460..7a74d084c67be 100644
--- a/clang/test/CodeGen/PowerPC/musttail.c
+++ b/clang/test/CodeGen/PowerPC/musttail.c
@@ -14,7 +14,7 @@ int foo(int x) {
 int bar(int x)
 {
   // good-no-diagnostics
-  // longcall-error@+2 {{'musttail' attribute for this call is impossible because long calls can not be tail called on PPC}}
+  // longcall-error@+2 {{'musttail' attribute for this call is impossible because long calls cannot be tail called on PPC}}
   // aix-error@+1 {{'musttail' attribute is not supported on AIX}}
  [[clang::musttail]] return foo(1);
 }
diff --git a/clang/test/CodeGen/RISCV/riscv-inline-asm.c b/clang/test/CodeGen/RISCV/riscv-inline-asm.c
index 75b91d3c497c5..de90e513ea1ff 100644
--- a/clang/test/CodeGen/RISCV/riscv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/riscv-inline-asm.c
@@ -33,6 +33,19 @@ void test_cf(float f, double d) {
   asm volatile("" : "=cf"(cd) : "cf"(d));
 }
 
+#if __riscv_xlen == 32
+typedef long long double_xlen_t;
+#elif __riscv_xlen == 64
+typedef __int128_t double_xlen_t;
+#endif
+double_xlen_t test_R_wide_scalar(double_xlen_t p) {
+// CHECK-LABEL: define{{.*}} {{i128|i64}} @test_R_wide_scalar(
+// CHECK: call {{i128|i64}} asm sideeffect "", "=R,R"({{i128|i64}} %{{.*}})
+  double_xlen_t ret;
+  asm volatile("" : "=R"(ret) : "R"(p));
+  return ret;
+}
+
 void test_I(void) {
 // CHECK-LABEL: define{{.*}} void @test_I()
 // CHECK: call void asm sideeffect "", "I"(i32 2047)
diff --git a/clang/test/CodeGen/X86/x86_64-PR42672.c b/clang/test/CodeGen/X86/x86_64-PR42672.c
index 6fe612d0aabdb..42894c0c4cb57 100644
--- a/clang/test/CodeGen/X86/x86_64-PR42672.c
+++ b/clang/test/CodeGen/X86/x86_64-PR42672.c
@@ -58,7 +58,7 @@ void odd_struct(void) {
       : "=r"(str));
 #endif
 }
-// CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: can't store value into a register
+// CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: cannot store value into a register
 
 // Check Clang reports an error if attempting to return a big structure via a register.
 void big_struct(void) {
@@ -70,7 +70,7 @@ void big_struct(void) {
       : "=r"(str));
 #endif
 }
-// CHECK-IMPOSSIBLE_BIG: impossible constraint in asm: can't store value into a register
+// CHECK-IMPOSSIBLE_BIG: impossible constraint in asm: cannot store value into a register
 
 // Clang is able to emit LLVM IR for an 16-byte structure.
 void x_constraint_fit(void) {
@@ -103,7 +103,7 @@ void x_constraint_nofit(void) {
 
 // http://crbug.com/999160
 // Clang used to report the following message:
-//   "impossible constraint in asm: can't store struct into a register"
+//   "impossible constraint in asm: cannot store struct into a register"
 // for the assembly directive below, although there's no struct.
 void crbug_999160_regtest(void) {
 #ifdef IMPOSSIBLE_9BYTES
@@ -113,7 +113,7 @@ void crbug_999160_regtest(void) {
 #endif
 }
 
-// CHECK-IMPOSSIBLE_9BYTES: impossible constraint in asm: can't store value into a register
+// CHECK-IMPOSSIBLE_9BYTES: impossible constraint in asm: cannot store value into a register
 
 void crbug_999160_regtest_v2(void) {
 #ifdef IMPOSSIBLE_9BYTES_V2
@@ -121,4 +121,4 @@ void crbug_999160_regtest_v2(void) {
   asm("" : "=r"(buf) : "0"(buf));
 #endif
 }
-// CHECK-IMPOSSIBLE_9BYTES_V2: impossible constraint in asm: can't store value into a register
+// CHECK-IMPOSSIBLE_9BYTES_V2: impossible constraint in asm: cannot store value into a register
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/compare.c b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
index 8f190990a6586..8886cf5c10058 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/compare.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
@@ -2376,7 +2376,7 @@ mve_pred16_t test_vcmphiq_m_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpleq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2392,7 +2392,7 @@ mve_pred16_t test_vcmpleq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpleq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2458,7 +2458,7 @@ mve_pred16_t test_vcmpleq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2476,7 +2476,7 @@ mve_pred16_t test_vcmpleq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2548,7 +2548,7 @@ mve_pred16_t test_vcmpleq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2567,7 +2567,7 @@ mve_pred16_t test_vcmpleq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2645,7 +2645,7 @@ mve_pred16_t test_vcmpleq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2666,7 +2666,7 @@ mve_pred16_t test_vcmpleq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2746,7 +2746,7 @@ mve_pred16_t test_vcmpleq_m_n_s32(int32x4_t a, int32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpltq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2762,7 +2762,7 @@ mve_pred16_t test_vcmpltq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpltq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2828,7 +2828,7 @@ mve_pred16_t test_vcmpltq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2846,7 +2846,7 @@ mve_pred16_t test_vcmpltq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2918,7 +2918,7 @@ mve_pred16_t test_vcmpltq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2937,7 +2937,7 @@ mve_pred16_t test_vcmpltq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3015,7 +3015,7 @@ mve_pred16_t test_vcmpltq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3036,7 +3036,7 @@ mve_pred16_t test_vcmpltq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
new file mode 100644
index 0000000000000..df7118859c764
--- /dev/null
+++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -fcuda-is-device -fembed-bitcode=marker -x hip %s -o - \
+// RUN:   | FileCheck %s --check-prefix=CHECK
+
+// CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1
+// CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1
+// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata"
+
+__attribute__((managed)) int foo = 42;
diff --git a/clang/test/CodeGen/ptrauth-module-flags.c b/clang/test/CodeGen/ptrauth-module-flags.c
new file mode 100644
index 0000000000000..5a7e9a7c2a36f
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-module-flags.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu                   -emit-llvm %s  -o - | FileCheck %s --check-prefix=OFF
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-elf-got -emit-llvm %s  -o - | FileCheck %s --check-prefix=ELFGOT
+
+// ELFGOT:      !llvm.module.flags = !{
+// ELFGOT-SAME: !1
+// ELFGOT:      !1 = !{i32 8, !"ptrauth-elf-got", i32 1}
+
+// OFF-NOT: "ptrauth-
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c
new file mode 100644
index 0000000000000..376cb11e84d3d
--- /dev/null
+++ b/clang/test/CodeGen/scoped-fence-ops.c
@@ -0,0 +1,257 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
+// RUN:   -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
+// RUN:   -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \
+// RUN:   -fvisibility=hidden | FileCheck --check-prefix=X86_64 %s
+
+// AMDGCN-LABEL: define hidden void @fe1a(
+// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    ret void
+//
+// SPIRV-LABEL: define hidden spir_func void @fe1a(
+// SPIRV-SAME: ) #[[ATTR0:[0-9]+]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    fence syncscope("workgroup") release
+// SPIRV-NEXT:    ret void
+//
+// X86_64-LABEL: define hidden void @fe1a(
+// X86_64-SAME: ) #[[ATTR0:[0-9]+]] {
+// X86_64-NEXT:  [[ENTRY:.*:]]
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    ret void
+//
+void fe1a() {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WRKGRP);
+}
+
+// AMDGCN-LABEL: define hidden void @fe1b(
+// AMDGCN-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr
+// AMDGCN-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// AMDGCN-NEXT:      i32 2, label %[[ACQUIRE]]
+// AMDGCN-NEXT:      i32 3, label %[[RELEASE:.*]]
+// AMDGCN-NEXT:      i32 4, label %[[ACQREL:.*]]
+// AMDGCN-NEXT:      i32 5, label %[[SEQCST:.*]]
+// AMDGCN-NEXT:    ]
+// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-NEXT:    ret void
+// AMDGCN:       [[ACQUIRE]]:
+// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") acquire
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[RELEASE]]:
+// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[ACQREL]]:
+// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") acq_rel
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[SEQCST]]:
+// AMDGCN-NEXT:    fence syncscope("workgroup") seq_cst
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// SPIRV-LABEL: define hidden spir_func void @fe1b(
+// SPIRV-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// SPIRV-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// SPIRV-NEXT:      i32 2, label %[[ACQUIRE]]
+// SPIRV-NEXT:      i32 3, label %[[RELEASE:.*]]
+// SPIRV-NEXT:      i32 4, label %[[ACQREL:.*]]
+// SPIRV-NEXT:      i32 5, label %[[SEQCST:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[ATOMIC_SCOPE_CONTINUE]]:
+// SPIRV-NEXT:    ret void
+// SPIRV:       [[ACQUIRE]]:
+// SPIRV-NEXT:    fence syncscope("workgroup") acquire
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[RELEASE]]:
+// SPIRV-NEXT:    fence syncscope("workgroup") release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[ACQREL]]:
+// SPIRV-NEXT:    fence syncscope("workgroup") acq_rel
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[SEQCST]]:
+// SPIRV-NEXT:    fence syncscope("workgroup") seq_cst
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// X86_64-LABEL: define hidden void @fe1b(
+// X86_64-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] {
+// X86_64-NEXT:  [[ENTRY:.*:]]
+// X86_64-NEXT:    [[ORD_ADDR:%.*]] = alloca i32, align 4
+// X86_64-NEXT:    store i32 [[ORD]], ptr [[ORD_ADDR]], align 4
+// X86_64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR]], align 4
+// X86_64-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// X86_64-NEXT:      i32 1, label %[[ACQUIRE:.*]]
+// X86_64-NEXT:      i32 2, label %[[ACQUIRE]]
+// X86_64-NEXT:      i32 3, label %[[RELEASE:.*]]
+// X86_64-NEXT:      i32 4, label %[[ACQREL:.*]]
+// X86_64-NEXT:      i32 5, label %[[SEQCST:.*]]
+// X86_64-NEXT:    ]
+// X86_64:       [[ATOMIC_SCOPE_CONTINUE]]:
+// X86_64-NEXT:    ret void
+// X86_64:       [[ACQUIRE]]:
+// X86_64-NEXT:    fence acquire
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[RELEASE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[ACQREL]]:
+// X86_64-NEXT:    fence acq_rel
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[SEQCST]]:
+// X86_64-NEXT:    fence seq_cst
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+void fe1b(int ord) {
+  __scoped_atomic_thread_fence(ord, __MEMORY_SCOPE_WRKGRP);
+}
+
+// AMDGCN-LABEL: define hidden void @fe1c(
+// AMDGCN-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT:    [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr
+// AMDGCN-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
+// AMDGCN-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// AMDGCN-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// AMDGCN-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// AMDGCN-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// AMDGCN-NEXT:    ]
+// AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
+// AMDGCN-NEXT:    ret void
+// AMDGCN:       [[DEVICE_SCOPE]]:
+// AMDGCN-NEXT:    fence syncscope("agent-one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[SYSTEM_SCOPE]]:
+// AMDGCN-NEXT:    fence syncscope("one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[WORKGROUP_SCOPE]]:
+// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[WAVEFRONT_SCOPE]]:
+// AMDGCN-NEXT:    fence syncscope("wavefront-one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN:       [[SINGLE_SCOPE]]:
+// AMDGCN-NEXT:    fence syncscope("singlethread-one-as") release
+// AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// SPIRV-LABEL: define hidden spir_func void @fe1c(
+// SPIRV-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4
+// SPIRV-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// SPIRV-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// SPIRV-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// SPIRV-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// SPIRV-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// SPIRV-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// SPIRV-NEXT:    ]
+// SPIRV:       [[ATOMIC_SCOPE_CONTINUE]]:
+// SPIRV-NEXT:    ret void
+// SPIRV:       [[DEVICE_SCOPE]]:
+// SPIRV-NEXT:    fence syncscope("device") release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[SYSTEM_SCOPE]]:
+// SPIRV-NEXT:    fence release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[WORKGROUP_SCOPE]]:
+// SPIRV-NEXT:    fence syncscope("workgroup") release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[WAVEFRONT_SCOPE]]:
+// SPIRV-NEXT:    fence syncscope("subgroup") release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV:       [[SINGLE_SCOPE]]:
+// SPIRV-NEXT:    fence syncscope("singlethread") release
+// SPIRV-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+// X86_64-LABEL: define hidden void @fe1c(
+// X86_64-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] {
+// X86_64-NEXT:  [[ENTRY:.*:]]
+// X86_64-NEXT:    [[SCOPE_ADDR:%.*]] = alloca i32, align 4
+// X86_64-NEXT:    store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4
+// X86_64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4
+// X86_64-NEXT:    switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
+// X86_64-NEXT:      i32 1, label %[[DEVICE_SCOPE:.*]]
+// X86_64-NEXT:      i32 0, label %[[SYSTEM_SCOPE:.*]]
+// X86_64-NEXT:      i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// X86_64-NEXT:      i32 3, label %[[WAVEFRONT_SCOPE:.*]]
+// X86_64-NEXT:      i32 4, label %[[SINGLE_SCOPE:.*]]
+// X86_64-NEXT:    ]
+// X86_64:       [[ATOMIC_SCOPE_CONTINUE]]:
+// X86_64-NEXT:    ret void
+// X86_64:       [[DEVICE_SCOPE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[SYSTEM_SCOPE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[WORKGROUP_SCOPE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[WAVEFRONT_SCOPE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64:       [[SINGLE_SCOPE]]:
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
+//
+void fe1c(int scope) {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, scope);
+}
+
+// AMDGCN-LABEL: define hidden void @fe2a(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    ret void
+//
+// SPIRV-LABEL: define hidden spir_func void @fe2a(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    ret void
+//
+// X86_64-LABEL: define hidden void @fe2a(
+// X86_64-SAME: ) #[[ATTR0]] {
+// X86_64-NEXT:  [[ENTRY:.*:]]
+// X86_64-NEXT:    ret void
+//
+void fe2a() {
+  __scoped_atomic_thread_fence(999, __MEMORY_SCOPE_SYSTEM);
+}
+
+// AMDGCN-LABEL: define hidden void @fe2b(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    fence syncscope("one-as") release
+// AMDGCN-NEXT:    ret void
+//
+// SPIRV-LABEL: define hidden spir_func void @fe2b(
+// SPIRV-SAME: ) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    fence release
+// SPIRV-NEXT:    ret void
+//
+// X86_64-LABEL: define hidden void @fe2b(
+// X86_64-SAME: ) #[[ATTR0]] {
+// X86_64-NEXT:  [[ENTRY:.*:]]
+// X86_64-NEXT:    fence release
+// X86_64-NEXT:    ret void
+//
+void fe2b() {
+  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, 999);
+}
diff --git a/clang/test/CodeGenCUDASPIRV/spirv-attrs.cu b/clang/test/CodeGenCUDASPIRV/spirv-attrs.cu
new file mode 100644
index 0000000000000..466aee00717a0
--- /dev/null
+++ b/clang/test/CodeGenCUDASPIRV/spirv-attrs.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -fcuda-is-device -triple spirv64 -o - -emit-llvm -x cuda %s  | FileCheck %s
+// RUN: %clang_cc1 -fcuda-is-device -triple spirv32 -o - -emit-llvm -x cuda %s  | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__global__ void reqd_work_group_size_128_1_1() {}
+
+__attribute__((work_group_size_hint(2, 2, 2)))
+__global__ void work_group_size_hint_2_2_2() {}
+
+__attribute__((vec_type_hint(int)))
+__global__ void vec_type_hint_int() {}
+
+__attribute__((intel_reqd_sub_group_size(64)))
+__global__ void intel_reqd_sub_group_size_64() {}
+
+// CHECK: define spir_kernel void @_Z28reqd_work_group_size_128_1_1v() #[[ATTR:[0-9]+]] !reqd_work_group_size ![[WG_SIZE:[0-9]+]]
+// CHECK: define spir_kernel void @_Z26work_group_size_hint_2_2_2v() #[[ATTR]] !work_group_size_hint ![[WG_HINT:[0-9]+]]
+// CHECK: define spir_kernel void @_Z17vec_type_hint_intv() #[[ATTR]] !vec_type_hint ![[VEC_HINT:[0-9]+]]
+// CHECK: define spir_kernel void @_Z28intel_reqd_sub_group_size_64v() #[[ATTR]] !intel_reqd_sub_group_size ![[SUB_GRP:[0-9]+]]
+
+// CHECK: attributes #[[ATTR]] = { {{.*}} }
+
+// CHECK: ![[WG_SIZE]] = !{i32 128, i32 1, i32 1}
+// CHECK: ![[WG_HINT]] = !{i32 2, i32 2, i32 2}
+// CHECK: ![[VEC_HINT]] = !{i32 undef, i32 1}
+// CHECK: ![[SUB_GRP]] = !{i32 64}
diff --git a/clang/test/CodeGenCXX/auto-var-init-attr.cpp b/clang/test/CodeGenCXX/auto-var-init-attr.cpp
new file mode 100644
index 0000000000000..5481c6e8613c5
--- /dev/null
+++ b/clang/test/CodeGenCXX/auto-var-init-attr.cpp
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -fdeclspec -ftrivial-auto-var-init=zero %s -emit-llvm -o - | FileCheck %s
+
+struct S { char c; };
+class C { char c; };
+enum class E { ZERO };
+union U { char c; int i; };
+
+struct __declspec(no_init_all) NoInitS { char c; };
+class __declspec(no_init_all) NoInitC { char c; };
+enum class __declspec(no_init_all) NoInitE { ZERO };
+union __declspec(no_init_all) NoInitU { char c; int i; };
+
+extern "C" {
+  void test_no_attr() {
+    // CHECK-LABEL: @test_no_attr()
+    // CHECK-NEXT:  entry:
+    // CHECK-NEXT:  %s = alloca %struct.S, align 1
+    // CHECK-NEXT:  %c = alloca %class.C, align 1
+    // CHECK-NEXT:  %e = alloca i32, align 4
+    // CHECK-NEXT:  %u = alloca %union.U, align 4
+    // CHECK-NEXT:  call void @llvm.memset.p0.i64(ptr align 1 %s, i8 0, i64 1, i1 false)
+    // CHECK-NEXT:  call void @llvm.memset.p0.i64(ptr align 1 %c, i8 0, i64 1, i1 false)
+    // CHECK-NEXT:  store i32 0, ptr %e, align 4
+    // CHECK-NEXT:  call void @llvm.memset.p0.i64(ptr align 4 %u, i8 0, i64 4, i1 false)
+    // CHECK-NEXT:  ret void
+    S s;
+    C c;
+    E e;
+    U u;
+  }
+
+  void __declspec(no_init_all) test_attr_on_function() {
+    // CHECK-LABEL: @test_attr_on_function()
+    // CHECK-NEXT:  entry:
+    // CHECK-NEXT:  %s = alloca %struct.S, align 1
+    // CHECK-NEXT:  %c = alloca %class.C, align 1
+    // CHECK-NEXT:  %e = alloca i32, align 4
+    // CHECK-NEXT:  %u = alloca %union.U, align 4
+    // CHECK-NEXT:  ret void
+    S s;
+    C c;
+    E e;
+    U u;
+  }
+
+  void test_attr_on_decl() {
+    // CHECK-LABEL: @test_attr_on_decl()
+    // CHECK-NEXT:  entry:
+    // CHECK-NEXT:  %s = alloca %struct.NoInitS, align 1
+    // CHECK-NEXT:  %c = alloca %class.NoInitC, align 1
+    // CHECK-NEXT:  %e = alloca i32, align 4
+    // CHECK-NEXT:  %u = alloca %union.NoInitU, align 4
+    // CHECK-NEXT:  ret void
+    NoInitS s;
+    NoInitC c;
+    NoInitE e;
+    NoInitU u;
+  }
+}
\ No newline at end of file
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8b56ec94f2c4e..61cbf5e65d0d2 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -32,6 +32,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s
@@ -88,6 +89,7 @@
 // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
new file mode 100644
index 0000000000000..86f4f73c81c0f
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -emit-llvm \
+// RUN:   -verify -o - %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \
+// RUN:   -verify -o - %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \
+// RUN:   -verify -o - %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \
+// RUN:   -verify -o - %s
+
+
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int uint;
+void test_prng_b32(global uint* out, uint a) {
+  *out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
new file mode 100644
index 0000000000000..f31ba85a52a7a
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @test_prng_b32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_prng_b32(global uint* out, uint a) {
+  *out = __builtin_amdgcn_prng_b32(a);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index dcdeee6b6acc4..841d8fcad0fee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
@@ -23,6 +24,7 @@ typedef short  v8s   __attribute__((ext_vector_type(8)));
 typedef short  v16s  __attribute__((ext_vector_type(16)));
 typedef short  v32s  __attribute__((ext_vector_type(32)));
 typedef double v4d   __attribute__((ext_vector_type(4)));
+typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
 
 
 #ifdef MFMA_GFX908_TESTS
@@ -222,7 +224,7 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
 
 #endif // MFMA_GFX90A_TESTS
 
-#ifdef MFMA_GFX940_TESTS
+#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
 // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
 void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
@@ -404,4 +406,29 @@ void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, in
 {
   *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
 }
-#endif // MFMA_GFX940_TESTS
+#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
+
+#ifdef MFMA_GFX950_TESTS
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
+// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
+
+v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
+{
+  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
+{
+  return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
+}
+
+// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
+// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
+v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
+  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
+}
+
+#endif // MFMA_GFX950_TESTS
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index d354f933c5ad7..d97b2ddb1fc66 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -110,6 +110,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011 -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1012 -DFAMILY=GFX10
diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index ba57843507298..7c34d3ec6c63a 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -95,6 +95,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefix=GFX1012 %s
@@ -150,6 +151,7 @@
 // GFX940:    "-target-cpu" "gfx940"
 // GFX941:    "-target-cpu" "gfx941"
 // GFX942:    "-target-cpu" "gfx942"
+// GFX950:    "-target-cpu" "gfx950"
 // GFX1010:   "-target-cpu" "gfx1010"
 // GFX1011:   "-target-cpu" "gfx1011"
 // GFX1012:   "-target-cpu" "gfx1012"
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 2cfbe256bc745..ddbf1fd951c84 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -71,9 +71,6 @@
 // RUN: %clang -### -S -fauto-profile=%S/Inputs/file.prof -fno-auto-profile %s 2>&1 | FileCheck -check-prefix=CHECK-NO-AUTO-PROFILE %s
 // CHECK-NO-AUTO-PROFILE-NOT: "-fprofile-sample-use={{.*}}/file.prof"
 
-// RUN: %clang -### -S -fauto-profile=%S/Inputs/file.prof -fno-profile-sample-use -fauto-profile %s 2>&1 | FileCheck -check-prefix=CHECK-AUTO-PROFILE %s
-// RUN: %clang -### -S -fauto-profile=%S/Inputs/file.prof -fno-auto-profile -fprofile-sample-use %s 2>&1 | FileCheck -check-prefix=CHECK-AUTO-PROFILE %s
-
 // RUN: %clang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s
 // RUN: %clang -### -S -fprofile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE %s
 // RUN: %clang -### -S -fprofile-generate=/some/dir %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-DIR %s
diff --git a/clang/test/Driver/fprofile-sample-use.c b/clang/test/Driver/fprofile-sample-use.c
new file mode 100644
index 0000000000000..7c8813a83785c
--- /dev/null
+++ b/clang/test/Driver/fprofile-sample-use.c
@@ -0,0 +1,5 @@
+/// GCC -fauto-profile (without =) is rejected.
+/// -fprofile-sample-use without = is rejected as well.
+// RUN: not %clang -### -S -fauto-profile -fprofile-sample-use %s 2>&1 | FileCheck %s --check-prefix=ERR
+// ERR: error: unknown argument: '-fauto-profile'
+// ERR: error: unknown argument: '-fprofile-sample-use'
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index 6ecf0b57bee5c..15f190165a7d7 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -866,6 +866,13 @@
 // CHECK-INTSAN-MINIMAL: "-fsanitize=integer-divide-by-zero,shift-base,shift-exponent,signed-integer-overflow,unsigned-integer-overflow,unsigned-shift-base,implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change"
 // CHECK-INTSAN-MINIMAL: "-fsanitize-minimal-runtime"
 
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=implicit-conversion -fsanitize-trap=implicit-conversion %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IMPL-CONV-TRAP
+// CHECK-IMPL-CONV-TRAP: "-fsanitize-trap=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change,implicit-bitfield-conversion"
+
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=implicit-conversion -fsanitize-minimal-runtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IMPL-CONV-MINIMAL
+// CHECK-IMPL-CONV-MINIMAL: "-fsanitize=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change,implicit-bitfield-conversion"
+// CHECK-IMPL-CONV-MINIMAL: "-fsanitize-minimal-runtime"
+
 // RUN: %clang --target=aarch64-linux-android -march=armv8-a+memtag -fsanitize=memtag -fsanitize-minimal-runtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MEMTAG-MINIMAL
 // CHECK-MEMTAG-MINIMAL: "-fsanitize=memtag-stack,memtag-heap,memtag-globals"
 // CHECK-MEMTAG-MINIMAL: "-fsanitize-minimal-runtime"
diff --git a/clang/test/Driver/module-output.cppm b/clang/test/Driver/module-output.cppm
index bf7bfbf3cb574..7cf0771f3d6ef 100644
--- a/clang/test/Driver/module-output.cppm
+++ b/clang/test/Driver/module-output.cppm
@@ -42,7 +42,7 @@ export module Hello;
 // CHECK: "-emit-module-interface" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/output/Hello.pcm" "-x" "c++" "{{.*}}/Hello.cppm"
 // CHECK: "-emit-obj" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/output/Hello.o" "-x" "pcm" "{{.*}}/output/Hello.pcm"
 
-// MULTIPLE-ARCH: option '-fmodule-output' can't be used with multiple arch options
+// MULTIPLE-ARCH: option '-fmodule-output' cannot be used with multiple arch options
 
 // CHECK-SPECIFIED: "-emit-module-interface" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/pcm/Hello.pcm" "-x" "c++" "{{.*}}/Hello.cppm"
 // CHECK-SPECIFIED: "-emit-obj" {{.*}}"-main-file-name" "Hello.cppm" {{.*}}"-o" "{{.*}}/Hello.o" "-x" "pcm" "{{.*}}/pcm/Hello.pcm"
diff --git a/clang/test/Driver/relax.s b/clang/test/Driver/relax.s
index 0768a38834447..7b084de7e6be2 100644
--- a/clang/test/Driver/relax.s
+++ b/clang/test/Driver/relax.s
@@ -8,7 +8,7 @@
 // RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s
 
 // REL: R_X86_64_REX_GOTPCRELX foo
-// REL: R_X86_64_REX2_GOTPCRELX foo
+// REL: R_X86_64_CODE_4_GOTPCRELX foo
 
         movq	foo@GOTPCREL(%rip), %rax
         movq	foo@GOTPCREL(%rip), %r16
diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index d36639d16ad4c..249216612f7ee 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -104,6 +104,68 @@
 // RUN: %clang --target=riscv32 -### -c %s 2>&1 -mtune=syntacore-scr1-max | FileCheck -check-prefix=MTUNE-SYNTACORE-SCR1-MAX %s
 // MTUNE-SYNTACORE-SCR1-MAX: "-tune-cpu" "syntacore-scr1-max"
 
+// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=tt-ascalon-d8 | FileCheck -check-prefix=MTUNE-TT-ASCALON-D8 %s
+// MTUNE-TT-ASCALON-D8: "-tune-cpu" "tt-ascalon-d8"
+
+// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=tt-ascalon-d8 | FileCheck -check-prefix=MCPU-TT-ASCALON-D8 %s
+// MCPU-TT-ASCALON-D8: "-target-cpu" "tt-ascalon-d8"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+m"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+a"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+f"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+d"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+c"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+v"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+h"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicbom"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicbop"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicboz"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicntr"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicond"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zicsr"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zifencei"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zihintntl"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zihintpause"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zihpm"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zimop"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zmmul"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zawrs"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zfa"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zfbfmin"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zfh"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zfhmin"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zca"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zcb"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zba"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zbb"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zbs"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zkt"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvbb"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvbc"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zve32f"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zve32x"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zve64d"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zve64f"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zve64x"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvfbfmin"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvfbfwma"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvfh"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvfhmin"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkb"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkg"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkn"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvknc"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkned"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkng"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvknhb"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvkt"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvl128b"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvl256b"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvl32b"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+zvl64b"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+svinval"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+svnapot"
+// MCPU-TT-ASCALON-D8-SAME: "-target-feature" "+svpbmt"
+
 // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=veyron-v1 | FileCheck -check-prefix=MCPU-VEYRON-V1 %s
 // MCPU-VEYRON-V1: "-target-cpu" "veyron-v1"
 // MCPU-VEYRON-V1: "-target-feature" "+m"
diff --git a/clang/test/Misc/pragma-attribute-strict-subjects.c b/clang/test/Misc/pragma-attribute-strict-subjects.c
index 7c2548c7dfc26..807977fb252aa 100644
--- a/clang/test/Misc/pragma-attribute-strict-subjects.c
+++ b/clang/test/Misc/pragma-attribute-strict-subjects.c
@@ -51,7 +51,7 @@ struct testRecoverStrictnessStruct { };
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), variable, enum))
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}}
 
 int testRecoverExtraVar = 0;
 // CHECK-LABEL: VarDecl{{.*}} testRecoverExtraVar
@@ -188,7 +188,7 @@ struct testSubset7Struct { };
 
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(record(unless(is_union)), function, variable, enum, enum_constant))
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}}
 
 int testSubsetRecoverVar;
 // CHECK-LABEL: VarDecl{{.*}} testSubsetRecoverVar
@@ -205,7 +205,7 @@ struct testSubsetRecoverStruct { };
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = enum)
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}}
 
 int testSubsetNoVar;
 // CHECK-LABEL: VarDecl{{.*}} testSubsetNoVar
diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
index 4e675871f1e5b..642d2df211c21 100644
--- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
+++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
@@ -48,6 +48,7 @@
 // CHECK-SAME: {{^}}, gfx940
 // CHECK-SAME: {{^}}, gfx941
 // CHECK-SAME: {{^}}, gfx942
+// CHECK-SAME: {{^}}, gfx950
 // CHECK-SAME: {{^}}, gfx1010
 // CHECK-SAME: {{^}}, gfx1011
 // CHECK-SAME: {{^}}, gfx1012
diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
index 44fe07065b242..3ea6c02d6b384 100644
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -54,6 +54,7 @@
 // CHECK-SAME: {{^}}, gfx940
 // CHECK-SAME: {{^}}, gfx941
 // CHECK-SAME: {{^}}, gfx942
+// CHECK-SAME: {{^}}, gfx950
 // CHECK-SAME: {{^}}, gfx10-1-generic
 // CHECK-SAME: {{^}}, gfx1010
 // CHECK-SAME: {{^}}, gfx1011
diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c b/clang/test/Misc/target-invalid-cpu-note/riscv.c
index 7bbf3574af3c3..8c5df5884cd79 100644
--- a/clang/test/Misc/target-invalid-cpu-note/riscv.c
+++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c
@@ -41,6 +41,7 @@
 // RISCV64-SAME: {{^}}, syntacore-scr4-rv64
 // RISCV64-SAME: {{^}}, syntacore-scr5-rv64
 // RISCV64-SAME: {{^}}, syntacore-scr7
+// RISCV64-SAME: {{^}}, tt-ascalon-d8
 // RISCV64-SAME: {{^}}, veyron-v1
 // RISCV64-SAME: {{^}}, xiangshan-nanhu
 // RISCV64-SAME: {{$}}
@@ -87,6 +88,7 @@
 // TUNE-RISCV64-SAME: {{^}}, syntacore-scr4-rv64
 // TUNE-RISCV64-SAME: {{^}}, syntacore-scr5-rv64
 // TUNE-RISCV64-SAME: {{^}}, syntacore-scr7
+// TUNE-RISCV64-SAME: {{^}}, tt-ascalon-d8
 // TUNE-RISCV64-SAME: {{^}}, veyron-v1
 // TUNE-RISCV64-SAME: {{^}}, xiangshan-nanhu
 // TUNE-RISCV64-SAME: {{^}}, generic
diff --git a/clang/test/Modules/no-eager-load.cppm b/clang/test/Modules/no-eager-load.cppm
index c9eddaaed1555..aa6de44c998f3 100644
--- a/clang/test/Modules/no-eager-load.cppm
+++ b/clang/test/Modules/no-eager-load.cppm
@@ -44,7 +44,7 @@ void use() {
            // expected-note@* {{but in 'a' found a different body}}
 }
 
-// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' can't be attached to other modules}}
+// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' cannot be attached to other modules}}
 // expected-note@b.cppm:* {{}}
 
 //--- h.cppm
@@ -59,5 +59,5 @@ void use() {
            // expected-note@* {{but in 'a' found a different body}}
 }
 
-// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' can't be attached to other modules}}
+// expected-error@a.cppm:* {{declaration 'foo' attached to named module 'a' cannot be attached to other modules}}
 // expected-note@b.cppm:* {{}}
diff --git a/clang/test/Modules/same-decl-in-different-modules.cppm b/clang/test/Modules/same-decl-in-different-modules.cppm
index 2e8e90f7cd8e9..8ad9e29051d4e 100644
--- a/clang/test/Modules/same-decl-in-different-modules.cppm
+++ b/clang/test/Modules/same-decl-in-different-modules.cppm
@@ -32,11 +32,11 @@ void test() {
     S<int> s;
 }
 
-// expected-error@mod1.cppm:* {{declaration 'v' attached to named module 'mod1' can't be attached to other modules}}
+// expected-error@mod1.cppm:* {{declaration 'v' attached to named module 'mod1' cannot be attached to other modules}}
 // expected-note@mod2.cppm:* {{}}
-// expected-error@mod1.cppm:* {{declaration 'func' attached to named module 'mod1' can't be attached to other modules}}
+// expected-error@mod1.cppm:* {{declaration 'func' attached to named module 'mod1' cannot be attached to other modules}}
 // expected-note@mod2.cppm:* {{}}
-// expected-error@mod1.cppm:* {{declaration 'A' attached to named module 'mod1' can't be attached to other modules}}
+// expected-error@mod1.cppm:* {{declaration 'A' attached to named module 'mod1' cannot be attached to other modules}}
 // expected-note@mod2.cppm:* {{}}
-// expected-error@mod1.cppm:* 1+{{declaration 'S' attached to named module 'mod1' can't be attached to other modules}}
+// expected-error@mod1.cppm:* 1+{{declaration 'S' attached to named module 'mod1' cannot be attached to other modules}}
 // expected-note@mod2.cppm:* 1+{{}}
diff --git a/clang/test/OpenMP/for_simd_loop_messages.cpp b/clang/test/OpenMP/for_simd_loop_messages.cpp
index 1cc5988ea8092..74a52f3f5d694 100644
--- a/clang/test/OpenMP/for_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/for_simd_loop_messages.cpp
@@ -731,7 +731,7 @@ void test_ordered() {
   for (int i = 0; i < 16; ++i)
     ;
 #pragma omp parallel
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp for simd' directive}}
 #pragma omp for simd ordered(1)
   for (int i = 0; i < 16; ++i)
     ;
diff --git a/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp
index 50d2da7e8fd4d..6072ad1b92445 100644
--- a/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp
+++ b/clang/test/OpenMP/masked_taskloop_simd_linear_messages.cpp
@@ -152,7 +152,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp masked taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp masked taskloop simd linear (a, b: val, B::ib)
diff --git a/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp
index ee29f63e110c0..c1bf61b8183ec 100644
--- a/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_linear_messages.cpp
@@ -152,7 +152,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp master taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp master taskloop simd linear (a, b: val, B::ib)
diff --git a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp
index f55453f6e8e15..4760a0281df54 100644
--- a/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/parallel_for_simd_loop_messages.cpp
@@ -638,7 +638,7 @@ void test_ordered() {
 #pragma omp parallel for simd ordered
   for (int i = 0; i < 16; ++i)
     ;
-//expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp parallel for simd' directive}}
+//expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp parallel for simd' directive}}
 #pragma omp parallel for simd ordered(1)
   for (int i = 0; i < 16; ++i)
     ;
diff --git a/clang/test/OpenMP/parallel_for_simd_messages.cpp b/clang/test/OpenMP/parallel_for_simd_messages.cpp
index 8237406a1c068..b3408fab4417f 100644
--- a/clang/test/OpenMP/parallel_for_simd_messages.cpp
+++ b/clang/test/OpenMP/parallel_for_simd_messages.cpp
@@ -94,7 +94,7 @@ void test_ordered() {
 #pragma omp parallel for simd ordered
   for (int i = 0; i < 16; ++i)
     ;
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp parallel for simd' directive}}
 #pragma omp parallel for simd ordered(1)
   for (int i = 0; i < 16; ++i)
     ;
diff --git a/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp
index a913a4e331964..bda3ef09181a6 100644
--- a/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp
+++ b/clang/test/OpenMP/parallel_masked_taskloop_simd_linear_messages.cpp
@@ -152,7 +152,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp parallel masked taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp parallel masked taskloop simd linear (a, b: val, B::ib)
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp
index 2be29fdc6b929..01a734cd927e2 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_linear_messages.cpp
@@ -152,7 +152,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp parallel master taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp parallel master taskloop simd linear (a, b: val, B::ib)
diff --git a/clang/test/OpenMP/simd_linear_messages.cpp b/clang/test/OpenMP/simd_linear_messages.cpp
index a19fad9d7718a..68a2999fdf65a 100644
--- a/clang/test/OpenMP/simd_linear_messages.cpp
+++ b/clang/test/OpenMP/simd_linear_messages.cpp
@@ -142,7 +142,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp simd linear (a, b: val, B::ib)
diff --git a/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
index 8dd7f68c25fd8..73ea96eb24278 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
@@ -29,26 +29,26 @@ T tmain(T argc, S **argv) {
 #pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+2 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+2 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
 #pragma omp target parallel for simd ordered(argc
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
 // expected-error@+3 {{argument to 'ordered' clause must be a strictly positive integer value}}
 // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
@@ -60,15 +60,15 @@ T tmain(T argc, S **argv) {
 #pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(1)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(N)
   for (T i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(2)
   foo();
   return argc;
@@ -85,11 +85,11 @@ int main(int argc, char **argv) {
 #pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
@@ -115,7 +115,7 @@ int main(int argc, char **argv) {
 // expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
 #pragma omp target parallel for simd ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
   foo();
-// expected-error@+1 {{'ordered' clause with a parameter can not be specified in '#pragma omp target parallel for simd' directive}}
+// expected-error@+1 {{'ordered' clause with a parameter cannot be specified in '#pragma omp target parallel for simd' directive}}
 #pragma omp target parallel for simd ordered(2)
   foo();
   return tmain<int, char, 1, 0>(argc, argv);
diff --git a/clang/test/OpenMP/taskloop_simd_linear_messages.cpp b/clang/test/OpenMP/taskloop_simd_linear_messages.cpp
index 22e2d26cb5561..5bf4785f14be4 100644
--- a/clang/test/OpenMP/taskloop_simd_linear_messages.cpp
+++ b/clang/test/OpenMP/taskloop_simd_linear_messages.cpp
@@ -152,7 +152,7 @@ template<class I, class C> int foomain(I argc, C **argv) {
   #pragma omp taskloop simd linear (S1) // expected-error {{'S1' does not refer to a value}}
   for (int k = 0; k < argc; ++k) ++k;
 #if defined(OMP52)
-  // omp52-error@+3{{step simple modifier is exclusive and can't be use with 'val', 'uval' or 'ref' modifier}}
+  // omp52-error@+3{{step simple modifier is exclusive and cannot be use with 'val', 'uval' or 'ref' modifier}}
   // expected-error@+2 {{linear variable with incomplete type 'S1'}}
   // expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S2'}}
   #pragma omp taskloop simd linear (a, b: val, B::ib)
diff --git a/clang/test/Parser/cxx2c-delete-with-message.cpp b/clang/test/Parser/cxx2c-delete-with-message.cpp
index 1767a080a7dcd..d2d5ccf4623c9 100644
--- a/clang/test/Parser/cxx2c-delete-with-message.cpp
+++ b/clang/test/Parser/cxx2c-delete-with-message.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,pre26 -pedantic %s
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify=expected,pre26 %s
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify=expected,pre26-pedantic -pedantic %s
 // RUN: %clang_cc1 -std=c++2c -fsyntax-only -verify=expected,compat -Wpre-c++26-compat %s
 // RUN: %clang_cc1 -std=c++2c -fsyntax-only -verify %s
 
@@ -7,15 +8,15 @@ struct S {
   void b() = delete(; // expected-error {{expected string literal}} expected-error {{expected ')'}} expected-note {{to match this '('}}
   void c() = delete(); // expected-error {{expected string literal}}
   void d() = delete(42); // expected-error {{expected string literal}}
-  void e() = delete("foo"[0]); // expected-error {{expected ')'}} expected-note {{to match this '('}} // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  void f() = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-
-  S() = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  ~S() = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  S(const S&) = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  S(S&&) = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  S& operator=(const S&) = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-  S& operator=(S&&) = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  void e() = delete("foo"[0]); // expected-error {{expected ')'}} expected-note {{to match this '('}} // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  void f() = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+
+  S() = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  ~S() = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  S(const S&) = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  S(S&&) = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  S& operator=(const S&) = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+  S& operator=(S&&) = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
 };
 
 struct T {
@@ -31,8 +32,8 @@ void a() = delete;
 void b() = delete(; // expected-error {{expected string literal}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 void c() = delete(); // expected-error {{expected string literal}}
 void d() = delete(42); // expected-error {{expected string literal}}
-void e() = delete("foo"[0]); // expected-error {{expected ')'}} expected-note {{to match this '('}} // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
-void f() = delete("foo"); // pre26-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+void e() = delete("foo"[0]); // expected-error {{expected ')'}} expected-note {{to match this '('}} // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+void f() = delete("foo"); // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
 
 constexpr const char *getMsg() { return "this is a message"; }
 void func() = delete(getMsg()); // expected-error {{expected string literal}}
@@ -49,3 +50,12 @@ struct C {
   U f = delete ("hello"); // expected-error {{cannot delete expression of type 'const char[6]'}}
 };
 }
+
+namespace GH109311 {
+void f() = delete
+#if __cpp_deleted_function >= 202403L
+    ("reason") // pre26-pedantic-warning {{'= delete' with a message is a C++2c extension}} \
+               // compat-warning {{'= delete' with a message is incompatible with C++ standards before C++2c}}
+#endif
+;
+}
diff --git a/clang/test/Parser/pragma-attribute.cpp b/clang/test/Parser/pragma-attribute.cpp
index 6377fc754352e..d5b1f848abd06 100644
--- a/clang/test/Parser/pragma-attribute.cpp
+++ b/clang/test/Parser/pragma-attribute.cpp
@@ -124,7 +124,7 @@ void function();
 #pragma clang attribute push (__attribute__((annotate)), apply_to=function foo) // expected-error {{extra tokens after attribute in a '#pragma clang attribute push'}}
 
 #pragma clang attribute push (__attribute__((objc_bridge_related)), apply_to=function)
-// expected-error@-1 {{attribute 'objc_bridge_related' can't be applied to 'function'}}
+// expected-error@-1 {{attribute 'objc_bridge_related' cannot be applied to 'function'}}
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
@@ -182,15 +182,15 @@ _Pragma("clang attribute pop");
 
 #pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_local))
 #pragma clang attribute pop
-#pragma clang attribute push([[clang::uninitialized]], apply_to = function) // expected-error {{attribute 'uninitialized' can't be applied to 'function'}}
+#pragma clang attribute push([[clang::uninitialized]], apply_to = function) // expected-error {{attribute 'uninitialized' cannot be applied to 'function'}}
 #pragma clang attribute pop
-#pragma clang attribute push([[clang::uninitialized]], apply_to = variable) // expected-error {{attribute 'uninitialized' can't be applied to 'variable'}}
+#pragma clang attribute push([[clang::uninitialized]], apply_to = variable) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable'}}
 #pragma clang attribute pop
-#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_thread_local)) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_thread_local)'}}
+#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_thread_local)) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_thread_local)'}}
 #pragma clang attribute pop
-#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_global)) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_global)'}}
+#pragma clang attribute push([[clang::uninitialized]], apply_to = variable(is_global)) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_global)'}}
 #pragma clang attribute pop
-#pragma clang attribute push([[clang::uninitialized]], apply_to = any(variable(is_parameter), variable(unless(is_parameter)))) // expected-error {{attribute 'uninitialized' can't be applied to 'variable(is_parameter)', and 'variable(unless(is_parameter))'}}
+#pragma clang attribute push([[clang::uninitialized]], apply_to = any(variable(is_parameter), variable(unless(is_parameter)))) // expected-error {{attribute 'uninitialized' cannot be applied to 'variable(is_parameter)', and 'variable(unless(is_parameter))'}}
 #pragma clang attribute pop
 // We're allowed to apply attributes to subsets of allowed subjects.
 #pragma clang attribute push([[clang::no_destroy]], apply_to = variable)
diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
index 38d27bc21e4aa..c20515716ae6b 100644
--- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
+++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl
@@ -3,7 +3,7 @@
 // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> class RWBuffer definition implicit_instantiation
 // CHECK: -TemplateArgument type 'float'
 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]]
 // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <<invalid sloc>> Implicit TypedBuffer
@@ -13,7 +13,7 @@ RWBuffer<float> Buffer1;
 // CHECK: -TemplateArgument type 'vector<float, 4>'
 // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4
 // CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float'
-// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit h '__hlsl_resource_t
+// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <<invalid sloc>> <invalid sloc> implicit __handle '__hlsl_resource_t
 // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]
 // CHECK-SAME{LITERAL}: [[hlsl::is_rov]]
 // CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]]
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index 2a3edc23f4753..a440791d6cc69 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -2,34 +2,37 @@
 //// For example, -fptrauth-init-fini will not affect codegen without -fptrauth-calls, but the preprocessor feature would be set anyway.
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-intrinsics | \
-// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-returns | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-address-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-type-info-vtable-pointer-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,TYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,TYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-function-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,FUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,FUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-init-fini | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,INITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,INITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-init-fini-address-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,INITFINI_ADDR_DISCR,NOGOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,INITFINI_ADDR_DISCR,NOGOTOS,NOELFGOT
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-indirect-gotos | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,GOTOS
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,GOTOS,NOELFGOT
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-elf-got | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,ELFGOT
 
 #if __has_feature(ptrauth_intrinsics)
 // INTRIN: has_ptrauth_intrinsics
@@ -119,3 +122,11 @@ void has_ptrauth_indirect_gotos() {}
 // NOGOTOS: no_ptrauth_indirect_gotos
 void no_ptrauth_indirect_gotos() {}
 #endif
+
+#if __has_feature(ptrauth_elf_got)
+// ELFGOT: has_ptrauth_elf_got
+void has_ptrauth_elf_got() {}
+#else
+// NOELFGOT: no_ptrauth_elf_got
+void no_ptrauth_elf_got() {}
+#endif
diff --git a/clang/test/Refactor/Extract/ObjCProperty.m b/clang/test/Refactor/Extract/ObjCProperty.m
index 152ccb3484215..23c9a8941b7ab 100644
--- a/clang/test/Refactor/Extract/ObjCProperty.m
+++ b/clang/test/Refactor/Extract/ObjCProperty.m
@@ -36,6 +36,6 @@ - (void)prohibitSetterExtraction {
   /*range prohibit_setter=->+0:55*/self.implicitSetter = 0;
 }
 // CHECK: 2 'prohibit_setter' results:
-// CHECK: the selected expression can't be extracted
+// CHECK: the selected expression cannot be extracted
 
 @end
diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h b/clang/test/Sema/Inputs/lifetime-analysis.h
new file mode 100644
index 0000000000000..41d1e2f074cc8
--- /dev/null
+++ b/clang/test/Sema/Inputs/lifetime-analysis.h
@@ -0,0 +1,138 @@
+
+namespace __gnu_cxx {
+template <typename T>
+struct basic_iterator {
+  basic_iterator operator++();
+  T& operator*() const;
+  T* operator->() const;
+};
+
+template<typename T>
+bool operator!=(basic_iterator<T>, basic_iterator<T>);
+}
+
+namespace std {
+template<typename T> struct remove_reference       { typedef T type; };
+template<typename T> struct remove_reference<T &>  { typedef T type; };
+template<typename T> struct remove_reference<T &&> { typedef T type; };
+
+template<typename T>
+typename remove_reference<T>::type &&move(T &&t) noexcept;
+
+template <typename C>
+auto data(const C &c) -> decltype(c.data());
+
+template <typename C>
+auto begin(C &c) -> decltype(c.begin());
+
+template<typename T, int N>
+T *begin(T (&array)[N]);
+
+using size_t = decltype(sizeof(0));
+
+template<typename T>
+struct initializer_list {
+  const T* ptr; size_t sz;
+};
+template<typename T> class allocator {};
+template <typename T, typename Alloc = allocator<T>>
+struct vector {
+  typedef __gnu_cxx::basic_iterator<T> iterator;
+  iterator begin();
+  iterator end();
+  const T *data() const;
+  vector();
+  vector(initializer_list<T> __l,
+         const Alloc& alloc = Alloc());
+
+  template<typename InputIterator>
+	vector(InputIterator first, InputIterator __last);
+
+  T &at(int n);
+};
+
+template<typename T>
+struct basic_string_view {
+  basic_string_view();
+  basic_string_view(const T *);
+  const T *begin() const;
+};
+using string_view = basic_string_view<char>;
+
+template<class _Mystr> struct iter {
+    iter& operator-=(int);
+
+    iter operator-(int _Off) const {
+        iter _Tmp = *this;
+        return _Tmp -= _Off;
+    }
+};
+
+template<typename T>
+struct basic_string {
+  basic_string();
+  basic_string(const T *);
+  const T *c_str() const;
+  operator basic_string_view<T> () const;
+  using const_iterator = iter<T>;
+};
+using string = basic_string<char>;
+
+template<typename T>
+struct unique_ptr {
+  T &operator*();
+  T *get() const;
+};
+
+template<typename T>
+struct optional {
+  optional();
+  optional(const T&);
+
+  template<typename U = T>
+  optional(U&& t);
+
+  template<typename U>
+  optional(optional<U>&& __t);
+
+  T &operator*() &;
+  T &&operator*() &&;
+  T &value() &;
+  T &&value() &&;
+};
+template<typename T>
+optional<__decay(T)> make_optional(T&&);
+
+
+template<typename T>
+struct stack {
+  T &top();
+};
+
+struct any {};
+
+template<typename T>
+T any_cast(const any& operand);
+
+template<typename T>
+struct reference_wrapper {
+  template<typename U>
+  reference_wrapper(U &&);
+};
+
+template<typename T>
+reference_wrapper<T> ref(T& t) noexcept;
+
+struct false_type {
+    static constexpr bool value = false;
+    constexpr operator bool() const noexcept { return value; }
+};
+struct true_type {
+    static constexpr bool value = true;
+    constexpr operator bool() const noexcept { return value; }
+};
+
+template<class T> struct is_pointer : false_type {};
+template<class T> struct is_pointer<T*> : true_type {};
+template<class T> struct is_pointer<T* const> : true_type {};
+}
diff --git a/clang/test/Sema/asm.c b/clang/test/Sema/asm.c
index 28ef3ec6ce09c..a9cff5947ef5d 100644
--- a/clang/test/Sema/asm.c
+++ b/clang/test/Sema/asm.c
@@ -90,7 +90,7 @@ int test7(unsigned long long b) {
 
 // PR3904
 void test8(int i) {
-  // A number in an input constraint can't point to a read-write constraint.
+  // A number in an input constraint cannot point to a read-write constraint.
   asm("" : "+r" (i), "=r"(i) :  "0" (i)); // expected-error{{invalid input constraint '0' in asm}}
 }
 
@@ -359,7 +359,7 @@ void test19(long long x)
   asm ("" : "=rm" (x): "0" (a)); // expected-error {{unsupported inline asm: input with type 'st_size64' (aka 'struct _st_size64') matching output with type 'long long'}}
   // FIXME: This case is actually supported by codegen.
   asm ("" : "=rm" (a): "0" (d)); // expected-error {{unsupported inline asm: input with type 'st_size32' (aka 'struct _st_size32') matching output with type 'st_size64' (aka 'struct _st_size64')}}
-  asm ("" : "=rm" (b): "0" (1)); // expected-error {{impossible constraint in asm: can't store value into a register}}
+  asm ("" : "=rm" (b): "0" (1)); // expected-error {{impossible constraint in asm: cannot store value into a register}}
   // FIXME: This case should be supported by codegen, but it fails now.
   asm ("" : "=rm" (e): "0" (1)); // no-error
   // FIXME: This case should be supported by codegen, but it fails now.
diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
index cc9108c0a4fbd..bbc909f627f4c 100644
--- a/clang/test/Sema/attr-nonblocking-constraints.cpp
+++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
@@ -144,6 +144,41 @@ void nb9() [[clang::nonblocking]]
 		expected-note {{in template expansion here}}
 }
 
+// Make sure we verify lambdas produced from template expansions.
+struct HasTemplatedLambda {
+	void (*fptr)() [[clang::nonblocking]];
+
+	template <typename C>
+	HasTemplatedLambda(const C&)
+		: fptr{ []() [[clang::nonblocking]] {
+			auto* y = new int; // expected-warning {{lambda with 'nonblocking' attribute must not allocate or deallocate memory}}
+		} }
+	{}
+};
+
+void nb9a()
+{
+	HasTemplatedLambda bad(42);
+}
+
+// Templated function and lambda.
+template <typename T>
+void TemplatedFunc(T x) [[clang::nonblocking]] {
+	auto* ptr = new T; // expected-warning {{function with 'nonblocking' attribute must not allocate or deallocate memory}}
+}
+
+void nb9b() [[clang::nonblocking]] {
+	TemplatedFunc(42); // expected-note {{in template expansion here}}
+
+	auto foo = [](auto x) [[clang::nonblocking]] {
+		auto* ptr = new int; // expected-warning {{lambda with 'nonblocking' attribute must not allocate or deallocate memory}}
+		return x;
+	};
+
+	// Note that foo() won't be validated unless instantiated.
+	foo(42);
+}
+
 void nb10(
 	void (*fp1)(), // expected-note {{function pointer cannot be inferred 'nonblocking'}}
 	void (*fp2)() [[clang::nonblocking]]
diff --git a/clang/test/Sema/c2x-nodiscard.c b/clang/test/Sema/c2x-nodiscard.c
index f8b0567366465..e2537bcf1d29d 100644
--- a/clang/test/Sema/c2x-nodiscard.c
+++ b/clang/test/Sema/c2x-nodiscard.c
@@ -31,10 +31,10 @@ enum E2 get_e(void);
 [[nodiscard]] int get_i(void);
 
 void f2(void) {
-  get_s(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  get_s3(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Wrong}}
+  get_s(); // expected-warning {{ignoring return value of type 'S4' declared with 'nodiscard' attribute}}
+  get_s3(); // expected-warning {{ignoring return value of type 'S3' declared with 'nodiscard' attribute: Wrong}}
   get_i(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-  get_e(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_e(); // expected-warning {{ignoring return value of type 'E2' declared with 'nodiscard' attribute}}
 
   // Okay, warnings are not encouraged
   (void)get_s();
@@ -50,7 +50,7 @@ struct [[nodiscard]] error_info{
 struct error_info enable_missile_safety_mode(void);
 void launch_missiles(void);
 void test_missiles(void) {
-  enable_missile_safety_mode(); // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+  enable_missile_safety_mode(); // expected-warning {{ignoring return value of type 'error_info' declared with 'nodiscard'}}
   launch_missiles();
 }
 
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index d15c587cfffc4..7063c290479f6 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -745,3 +745,35 @@ constexpr long long reduceAddLong2 = __builtin_reduce_add((vector4long){(1LL <<
 // expected-note@-1 {{outside the range of representable values of type 'long long'}}
 static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
 static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
+
+static_assert(__builtin_reduce_mul((vector4char){}) == 0);
+static_assert(__builtin_reduce_mul((vector4char){1, 2, 3, 4}) == 24);
+static_assert(__builtin_reduce_mul((vector4short){1, 2, 30, 40}) == 2400);
+static_assert(__builtin_reduce_mul((vector4int){10, 20, 300, 400}) == 24000000);
+static_assert(__builtin_reduce_mul((vector4long){1000L, 2000L, 3000L, 4000L}) == 24000000000000L);
+constexpr int reduceMulInt1 = __builtin_reduce_mul((vector4int){~(1 << 31), 1, 1, 2});
+// expected-error@-1 {{must be initialized by a constant expression}} \
+// expected-note@-1 {{outside the range of representable values of type 'int'}}
+constexpr long long reduceMulLong1 = __builtin_reduce_mul((vector4long){~(1LL << 63), 1, 1, 2});
+// expected-error@-1 {{must be initialized by a constant expression}} \
+// expected-note@-1 {{outside the range of representable values of type 'long long'}}
+constexpr int reduceMulInt2 = __builtin_reduce_mul((vector4int){(1 << 31), 1, 1, 2});
+// expected-error@-1 {{must be initialized by a constant expression}} \
+// expected-note@-1 {{outside the range of representable values of type 'int'}}
+constexpr long long reduceMulLong2 = __builtin_reduce_mul((vector4long){(1LL << 63), 1, 1, 2});
+// expected-error@-1 {{must be initialized by a constant expression}} \
+// expected-note@-1 {{outside the range of representable values of type 'long long'}}
+static_assert(__builtin_reduce_mul((vector4uint){~0U, 1, 1, 2}) == ~0U - 1);
+static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
+
+static_assert(__builtin_reduce_and((vector4char){}) == 0);
+static_assert(__builtin_reduce_and((vector4char){(char)0x11, (char)0x22, (char)0x44, (char)0x88}) == 0);
+static_assert(__builtin_reduce_and((vector4short){(short)0x1111, (short)0x2222, (short)0x4444, (short)0x8888}) == 0);
+static_assert(__builtin_reduce_and((vector4int){(int)0x11111111, (int)0x22222222, (int)0x44444444, (int)0x88888888}) == 0);
+static_assert(__builtin_reduce_and((vector4long){(long long)0x1111111111111111L, (long long)0x2222222222222222L, (long long)0x4444444444444444L, (long long)0x8888888888888888L}) == 0L);
+static_assert(__builtin_reduce_and((vector4char){(char)-1, (char)~0x22, (char)~0x44, (char)~0x88}) == 0x11);
+static_assert(__builtin_reduce_and((vector4short){(short)~0x1111, (short)-1, (short)~0x4444, (short)~0x8888}) == 0x2222);
+static_assert(__builtin_reduce_and((vector4int){(int)~0x11111111, (int)~0x22222222, (int)-1, (int)~0x88888888}) == 0x44444444);
+static_assert(__builtin_reduce_and((vector4long){(long long)~0x1111111111111111L, (long long)~0x2222222222222222L, (long long)~0x4444444444444444L, (long long)-1}) == 0x8888888888888888L);
+static_assert(__builtin_reduce_and((vector4uint){0x11111111U, 0x22222222U, 0x44444444U, 0x88888888U}) == 0U);
+static_assert(__builtin_reduce_and((vector4ulong){0x1111111111111111UL, 0x2222222222222222UL, 0x4444444444444444UL, 0x8888888888888888UL}) == 0L);
diff --git a/clang/test/Sema/pragma-attribute-strict-subjects.c b/clang/test/Sema/pragma-attribute-strict-subjects.c
index 4f37c271ce34a..85b484799529a 100644
--- a/clang/test/Sema/pragma-attribute-strict-subjects.c
+++ b/clang/test/Sema/pragma-attribute-strict-subjects.c
@@ -52,16 +52,16 @@
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), variable, enum))
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}}
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(enum_constant, function, record(unless(is_union)), variable, variable(is_parameter), enum))
 // FIXME: comma in this diagnostic is wrong.
-// expected-error@-2 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}}
+// expected-error@-2 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}}
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(function, record(unless(is_union)), enum))
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}}
 #pragma clang attribute pop
 
 // Verify the non-strict subject set verification.
@@ -96,12 +96,12 @@
 
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = any(record(unless(is_union)), function, variable, enum, enum_constant))
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum_constant', and 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum_constant', and 'enum'}}
 
 #pragma clang attribute pop
 
 #pragma clang attribute push (__attribute__((abi_tag("a"))), apply_to = enum)
-// expected-error@-1 {{attribute 'abi_tag' can't be applied to 'enum'}}
+// expected-error@-1 {{attribute 'abi_tag' cannot be applied to 'enum'}}
 
 #pragma clang attribute pop
 
@@ -124,21 +124,21 @@
 #pragma clang attribute pop
 
 #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_interface, objc_protocol))
-// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}}
+// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}}
 #pragma clang attribute pop
 
 #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_protocol))
-// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}}
+// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}}
 // Don't report an error about missing 'objc_interface' as we aren't parsing
 // Objective-C.
 #pragma clang attribute pop
 
 #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_interface, objc_protocol))
-// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}}
+// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}}
 #pragma clang attribute pop
 
 #pragma clang attribute push(__attribute__((objc_subclassing_restricted)), apply_to = any(objc_protocol))
-// expected-error@-1 {{attribute 'objc_subclassing_restricted' can't be applied to 'objc_protocol'}}
+// expected-error@-1 {{attribute 'objc_subclassing_restricted' cannot be applied to 'objc_protocol'}}
 // Don't report an error about missing 'objc_interface' as we aren't parsing
 // Objective-C.
 #pragma clang attribute pop
diff --git a/clang/test/Sema/warn-lifetime-analysis-capture-by.cpp b/clang/test/Sema/warn-lifetime-analysis-capture-by.cpp
new file mode 100644
index 0000000000000..b3fde386b8616
--- /dev/null
+++ b/clang/test/Sema/warn-lifetime-analysis-capture-by.cpp
@@ -0,0 +1,368 @@
+// RUN: %clang_cc1 --std=c++20 -fsyntax-only -verify -Wdangling-capture %s
+
+#include "Inputs/lifetime-analysis.h"
+
+// ****************************************************************************
+// Capture an integer
+// ****************************************************************************
+namespace capture_int {
+struct X {} x;
+void captureInt(const int &i [[clang::lifetime_capture_by(x)]], X &x);
+void captureRValInt(int &&i [[clang::lifetime_capture_by(x)]], X &x);
+void noCaptureInt(int i [[clang::lifetime_capture_by(x)]], X &x);
+
+void use() {
+  int local;
+  captureInt(1, // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+            x);
+  captureRValInt(1, x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureInt(local, x);
+  noCaptureInt(1, x);
+  noCaptureInt(local, x);
+}
+} // namespace capture_int
+
+// ****************************************************************************
+// Capture std::string (gsl owner types)
+// ****************************************************************************
+namespace capture_string {
+struct X {} x;
+void captureString(const std::string &s [[clang::lifetime_capture_by(x)]], X &x);
+void captureRValString(std::string &&s [[clang::lifetime_capture_by(x)]], X &x);
+
+void use() {
+  std::string local_string;
+  captureString(std::string(), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureString(local_string, x);
+  captureRValString(std::move(local_string), x);
+  captureRValString(std::string(), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+}
+} // namespace capture_string
+
+// ****************************************************************************
+// Capture std::string_view (gsl pointer types)
+// ****************************************************************************
+namespace capture_string_view {
+struct X {} x;
+void captureStringView(std::string_view s [[clang::lifetime_capture_by(x)]], X &x);
+void captureRValStringView(std::string_view &&sv [[clang::lifetime_capture_by(x)]], X &x);
+void noCaptureStringView(std::string_view sv, X &x);
+
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+std::string_view getNotLifetimeBoundView(const std::string& s);
+const std::string& getLifetimeBoundString(const std::string &s [[clang::lifetimebound]]);
+const std::string& getLifetimeBoundString(std::string_view sv [[clang::lifetimebound]]);
+
+void use() {
+  std::string_view local_string_view;
+  std::string local_string;
+  captureStringView(local_string_view, x);
+  captureStringView(std::string(), // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+            x);
+
+  captureStringView(getLifetimeBoundView(local_string), x);
+  captureStringView(getNotLifetimeBoundView(std::string()), x);
+  captureRValStringView(std::move(local_string_view), x);
+  captureRValStringView(std::string(), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureRValStringView(std::string_view{"abcd"}, x);
+
+  noCaptureStringView(local_string_view, x);
+  noCaptureStringView(std::string(), x);
+
+  // With lifetimebound functions.
+  captureStringView(getLifetimeBoundView(
+  std::string() // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  ), x);
+  captureRValStringView(getLifetimeBoundView(local_string), x);
+  captureRValStringView(getLifetimeBoundView(std::string()), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureRValStringView(getNotLifetimeBoundView(std::string()), x);
+  noCaptureStringView(getLifetimeBoundView(std::string()), x);
+  captureStringView(getLifetimeBoundString(std::string()), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureStringView(getLifetimeBoundString(getLifetimeBoundView(std::string())), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureStringView(getLifetimeBoundString(getLifetimeBoundString(
+    std::string()  // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+    )), x);
+}
+} // namespace capture_string_view
+
+// ****************************************************************************
+// Capture pointer (e.g. std::string*)
+// ****************************************************************************
+const std::string* getLifetimeBoundPointer(const std::string &s [[clang::lifetimebound]]);
+const std::string* getNotLifetimeBoundPointer(const std::string &s);
+
+namespace capture_pointer {
+struct X {} x;
+void capturePointer(const std::string* sp [[clang::lifetime_capture_by(x)]], X &x);
+void use() {
+  capturePointer(getLifetimeBoundPointer(std::string()), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  capturePointer(getLifetimeBoundPointer(*getLifetimeBoundPointer(
+    std::string()  // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+    )), x);
+  capturePointer(getNotLifetimeBoundPointer(std::string()), x);
+
+}
+} // namespace capture_pointer
+
+// ****************************************************************************
+// Arrays and initializer lists.
+// ****************************************************************************
+namespace init_lists {
+struct X {} x;
+void captureVector(const std::vector<int> &a [[clang::lifetime_capture_by(x)]], X &x);
+void captureArray(int array [[clang::lifetime_capture_by(x)]] [2], X &x);
+void captureInitList(std::initializer_list<int> abc [[clang::lifetime_capture_by(x)]], X &x);
+
+
+std::initializer_list<int> getLifetimeBoundInitList(std::initializer_list<int> abc [[clang::lifetimebound]]);
+
+void use() {
+  captureVector({1, 2, 3}, x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureVector(std::vector<int>{}, x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  std::vector<int> local_vector;
+  captureVector(local_vector, x);
+  int local_array[2]; 
+  captureArray(local_array, x);
+  captureInitList({1, 2}, x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureInitList(getLifetimeBoundInitList({1, 2}), x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+}
+} // namespace init_lists
+
+// ****************************************************************************
+// Implicit object param 'this' is captured
+// ****************************************************************************
+namespace this_is_captured {
+struct X {} x;
+struct S {
+  void capture(X &x) [[clang::lifetime_capture_by(x)]];
+};
+void use() {
+  S{}.capture(x); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  S s;
+  s.capture(x);
+}
+} // namespace this_is_captured
+
+// ****************************************************************************
+// Capture by Global and Unknown.
+// ****************************************************************************
+namespace capture_by_global_unknown {
+void captureByGlobal(std::string_view s [[clang::lifetime_capture_by(global)]]);
+void captureByUnknown(std::string_view s [[clang::lifetime_capture_by(unknown)]]);
+
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+
+void use() {  
+  std::string_view local_string_view;
+  std::string local_string;
+  // capture by global.
+  captureByGlobal(std::string()); // expected-warning {{object whose reference is captured will be destroyed at the end of the full-expression}}
+  captureByGlobal(getLifetimeBoundView(std::string())); // expected-warning {{object whose reference is captured will be destroyed at the end of the full-expression}}
+  captureByGlobal(local_string);
+  captureByGlobal(local_string_view);
+
+  // capture by unknown.
+  captureByUnknown(std::string()); // expected-warning {{object whose reference is captured will be destroyed at the end of the full-expression}}
+  captureByUnknown(getLifetimeBoundView(std::string())); // expected-warning {{object whose reference is captured will be destroyed at the end of the full-expression}}
+  captureByUnknown(local_string);
+  captureByUnknown(local_string_view);
+}
+} // namespace capture_by_global_unknown
+
+// ****************************************************************************
+// Member functions: Capture by 'this'
+// ****************************************************************************
+namespace capture_by_this {
+struct S {
+  void captureInt(const int& x [[clang::lifetime_capture_by(this)]]);
+  void captureView(std::string_view sv [[clang::lifetime_capture_by(this)]]);
+};
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+std::string_view getNotLifetimeBoundView(const std::string& s);
+const std::string& getLifetimeBoundString(const std::string &s [[clang::lifetimebound]]);
+
+void use() {
+  S s;
+  s.captureInt(1); // expected-warning {{object whose reference is captured by 's' will be destroyed at the end of the full-expression}}
+  s.captureView(std::string()); // expected-warning {{object whose reference is captured by 's' will be destroyed at the end of the full-expression}}
+  s.captureView(getLifetimeBoundView(std::string())); // expected-warning {{object whose reference is captured by 's' will be destroyed at the end of the full-expression}}
+  s.captureView(getLifetimeBoundString(std::string()));  // expected-warning {{object whose reference is captured by 's' will be destroyed at the end of the full-expression}}
+  s.captureView(getNotLifetimeBoundView(std::string()));
+}  
+} // namespace capture_by_this
+
+// ****************************************************************************
+// Struct with field as a reference
+// ****************************************************************************
+namespace reference_field {
+struct X {} x;
+struct Foo {
+  const int& b;
+};
+void captureField(Foo param [[clang::lifetime_capture_by(x)]], X &x);
+void use() {
+  captureField(Foo{
+    1 // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  }, x);
+  int local;
+  captureField(Foo{local}, x);
+}
+} // namespace reference_field
+
+// ****************************************************************************
+// Capture default argument.
+// ****************************************************************************
+namespace default_arg {
+struct X {} x;
+void captureDefaultArg(X &x, std::string_view s [[clang::lifetime_capture_by(x)]] = std::string());
+
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+
+void useCaptureDefaultArg() {
+  X x;
+  captureDefaultArg(x); // FIXME: Diagnose temporary default arg.
+  captureDefaultArg(x, std::string("temp")); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  captureDefaultArg(x, getLifetimeBoundView(std::string())); // expected-warning {{object whose reference is captured by 'x' will be destroyed at the end of the full-expression}}
+  std::string local;
+  captureDefaultArg(x, local);
+}
+} // namespace default_arg 
+
+// ****************************************************************************
+// Container: *No* distinction between pointer-like and other element types
+// ****************************************************************************
+namespace containers_no_distinction {
+template<class T>
+struct MySet {
+  void insert(T&& t [[clang::lifetime_capture_by(this)]]);
+  void insert(const T& t [[clang::lifetime_capture_by(this)]]);
+};
+void user_defined_containers() {
+  MySet<int> set_of_int;
+  set_of_int.insert(1); // expected-warning {{object whose reference is captured by 'set_of_int' will be destroyed at the end of the full-expression}}
+  MySet<std::string_view> set_of_sv;
+  set_of_sv.insert(std::string());  // expected-warning {{object whose reference is captured by 'set_of_sv' will be destroyed at the end of the full-expression}}
+  set_of_sv.insert(std::string_view());
+}
+} // namespace containers_no_distinction
+
+// ****************************************************************************
+// Container: Different behavior for pointer-like and other element types.
+// ****************************************************************************
+namespace conatiners_with_different {
+template<typename T> struct IsPointerLikeTypeImpl : std::false_type {};
+template<> struct IsPointerLikeTypeImpl<std::string_view> : std::true_type {};
+template<typename T> concept IsPointerLikeType = std::is_pointer<T>::value || IsPointerLikeTypeImpl<T>::value;
+
+template<class T> struct MyVector {
+  void push_back(T&& t [[clang::lifetime_capture_by(this)]]) requires IsPointerLikeType<T>;
+  void push_back(const T& t [[clang::lifetime_capture_by(this)]]) requires IsPointerLikeType<T>;
+
+  void push_back(T&& t) requires (!IsPointerLikeType<T>);
+  void push_back(const T& t) requires (!IsPointerLikeType<T>);
+};
+
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+
+void use_container() {
+  std::string local;
+
+  MyVector<std::string> vector_of_string;
+  vector_of_string.push_back(std::string()); // Ok.
+  
+  MyVector<std::string_view> vector_of_view;
+  vector_of_view.push_back(std::string()); // expected-warning {{object whose reference is captured by 'vector_of_view' will be destroyed at the end of the full-expression}}
+  vector_of_view.push_back(getLifetimeBoundView(std::string())); // expected-warning {{object whose reference is captured by 'vector_of_view' will be destroyed at the end of the full-expression}}
+  
+  MyVector<const std::string*> vector_of_pointer;
+  vector_of_pointer.push_back(getLifetimeBoundPointer(std::string())); // expected-warning {{object whose reference is captured by 'vector_of_pointer' will be destroyed at the end of the full-expression}}
+  vector_of_pointer.push_back(getLifetimeBoundPointer(*getLifetimeBoundPointer(std::string()))); // expected-warning {{object whose reference is captured by 'vector_of_pointer' will be destroyed at the end of the full-expression}}
+  vector_of_pointer.push_back(getLifetimeBoundPointer(local));
+  vector_of_pointer.push_back(getNotLifetimeBoundPointer(std::string()));
+}
+
+// ****************************************************************************
+// Container: For user defined view types
+// ****************************************************************************
+struct [[gsl::Pointer()]] MyStringView : public std::string_view {
+  MyStringView();
+  MyStringView(std::string_view&&);
+  MyStringView(const MyStringView&);
+  MyStringView(const std::string&);
+};
+template<> struct IsPointerLikeTypeImpl<MyStringView> : std::true_type {};
+
+std::optional<std::string_view> getOptionalSV();
+std::optional<std::string> getOptionalS();
+std::optional<MyStringView> getOptionalMySV();
+MyStringView getMySV();
+
+class MyStringViewNotPointer : public std::string_view {};
+std::optional<MyStringViewNotPointer> getOptionalMySVNotP();
+MyStringViewNotPointer getMySVNotP();
+
+std::string_view getLifetimeBoundView(const std::string& s [[clang::lifetimebound]]);
+std::string_view getNotLifetimeBoundView(const std::string& s);
+const std::string& getLifetimeBoundString(const std::string &s [[clang::lifetimebound]]);
+const std::string& getLifetimeBoundString(std::string_view sv [[clang::lifetimebound]]);
+
+void use_my_view() {
+  std::string local;
+  MyVector<MyStringView> vector_of_my_view;
+  vector_of_my_view.push_back(getMySV());
+  vector_of_my_view.push_back(MyStringView{});
+  vector_of_my_view.push_back(std::string_view{});
+  vector_of_my_view.push_back(std::string{}); // expected-warning {{object whose reference is captured by 'vector_of_my_view' will be destroyed at the end of the full-expression}}
+  vector_of_my_view.push_back(getLifetimeBoundView(std::string{})); // expected-warning {{object whose reference is captured by 'vector_of_my_view' will be destroyed at the end of the full-expression}}
+  vector_of_my_view.push_back(getLifetimeBoundString(getLifetimeBoundView(std::string{}))); // expected-warning {{object whose reference is captured by 'vector_of_my_view' will be destroyed at the end of the full-expression}}
+  vector_of_my_view.push_back(getNotLifetimeBoundView(getLifetimeBoundString(getLifetimeBoundView(std::string{}))));
+  
+  // Use with container of other view types.
+  MyVector<std::string_view> vector_of_view;
+  vector_of_view.push_back(getMySV());
+  vector_of_view.push_back(getMySVNotP());
+}
+
+// ****************************************************************************
+// Container: Use with std::optional<view> (owner<pointer> types)
+// ****************************************************************************
+void use_with_optional_view() {
+  MyVector<std::string_view> vector_of_view;
+
+  std::optional<std::string_view> optional_of_view;
+  vector_of_view.push_back(optional_of_view.value());
+  vector_of_view.push_back(getOptionalS().value()); // expected-warning {{object whose reference is captured by 'vector_of_view' will be destroyed at the end of the full-expression}}
+  
+  vector_of_view.push_back(getOptionalSV().value());
+  vector_of_view.push_back(getOptionalMySV().value());
+  vector_of_view.push_back(getOptionalMySVNotP().value());
+}
+} // namespace conatiners_with_different
+
+// ****************************************************************************
+// Capture 'temporary' views
+// ****************************************************************************
+namespace temporary_views {
+void capture1(std::string_view s [[clang::lifetime_capture_by(x)]], std::vector<std::string_view>& x);
+
+// Intended to capture the "string_view" itself
+void capture2(const std::string_view& s [[clang::lifetime_capture_by(x)]], std::vector<std::string_view*>& x);
+// Intended to capture the pointee of the "string_view"
+void capture3(const std::string_view& s [[clang::lifetime_capture_by(x)]], std::vector<std::string_view>& x);
+
+void use() {
+  std::vector<std::string_view> x1;
+  capture1(std::string(), x1); // expected-warning {{object whose reference is captured by 'x1' will be destroyed at the end of the full-expression}}
+  capture1(std::string_view(), x1);
+
+  std::vector<std::string_view*> x2;
+  // Clang considers 'const std::string_view&' to refer to the owner
+  // 'std::string' and not 'std::string_view'. Therefore no diagnostic on the next line.
+  capture2(std::string_view(), x2);
+  capture2(std::string(), x2); // expected-warning {{object whose reference is captured by 'x2' will be destroyed at the end of the full-expression}}
+  
+  std::vector<std::string_view> x3;
+  capture3(std::string_view(), x3);
+  capture3(std::string(), x3); // expected-warning {{object whose reference is captured by 'x3' will be destroyed at the end of the full-expression}}
+}
+} // namespace temporary_views
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index 6a2af01ea5116..c18ecd86ad06f 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -Wdangling -Wdangling-field -Wreturn-stack-address -verify %s
+#include "Inputs/lifetime-analysis.h"
 struct [[gsl::Owner(int)]] MyIntOwner {
   MyIntOwner();
   int &operator*();
@@ -129,130 +130,6 @@ void initLocalGslPtrWithTempOwner() {
   global2 = MyLongOwnerWithConversion{}; // expected-warning {{object backing the pointer global2 }}
 }
 
-namespace __gnu_cxx {
-template <typename T>
-struct basic_iterator {
-  basic_iterator operator++();
-  T& operator*() const;
-  T* operator->() const;
-};
-
-template<typename T>
-bool operator!=(basic_iterator<T>, basic_iterator<T>);
-}
-
-namespace std {
-template<typename T> struct remove_reference       { typedef T type; };
-template<typename T> struct remove_reference<T &>  { typedef T type; };
-template<typename T> struct remove_reference<T &&> { typedef T type; };
-
-template<typename T>
-typename remove_reference<T>::type &&move(T &&t) noexcept;
-
-template <typename C>
-auto data(const C &c) -> decltype(c.data());
-
-template <typename C>
-auto begin(C &c) -> decltype(c.begin());
-
-template<typename T, int N>
-T *begin(T (&array)[N]);
-
-using size_t = decltype(sizeof(0));
-
-template<typename T>
-struct initializer_list {
-  const T* ptr; size_t sz;
-};
-template<typename T> class allocator {};
-template <typename T, typename Alloc = allocator<T>>
-struct vector {
-  typedef __gnu_cxx::basic_iterator<T> iterator;
-  iterator begin();
-  iterator end();
-  const T *data() const;
-  vector();
-  vector(initializer_list<T> __l,
-         const Alloc& alloc = Alloc());
-
-  template<typename InputIterator>
-	vector(InputIterator first, InputIterator __last);
-
-  T &at(int n);
-};
-
-template<typename T>
-struct basic_string_view {
-  basic_string_view();
-  basic_string_view(const T *);
-  const T *begin() const;
-};
-using string_view = basic_string_view<char>;
-
-template<class _Mystr> struct iter {
-    iter& operator-=(int);
-
-    iter operator-(int _Off) const {
-        iter _Tmp = *this;
-        return _Tmp -= _Off;
-    }
-};
-
-template<typename T>
-struct basic_string {
-  basic_string();
-  basic_string(const T *);
-  const T *c_str() const;
-  operator basic_string_view<T> () const;
-  using const_iterator = iter<T>;
-};
-using string = basic_string<char>;
-
-template<typename T>
-struct unique_ptr {
-  T &operator*();
-  T *get() const;
-};
-
-template<typename T>
-struct optional {
-  optional();
-  optional(const T&);
-
-  template<typename U = T>
-  optional(U&& t);
-
-  template<typename U>
-  optional(optional<U>&& __t);
-
-  T &operator*() &;
-  T &&operator*() &&;
-  T &value() &;
-  T &&value() &&;
-};
-template<typename T>
-optional<__decay(T)> make_optional(T&&);
-
-
-template<typename T>
-struct stack {
-  T &top();
-};
-
-struct any {};
-
-template<typename T>
-T any_cast(const any& operand);
-
-template<typename T>
-struct reference_wrapper {
-  template<typename U>
-  reference_wrapper(U &&);
-};
-
-template<typename T>
-reference_wrapper<T> ref(T& t) noexcept;
-}
 
 struct Unannotated {
   typedef std::vector<int>::iterator iterator;
diff --git a/clang/test/SemaCUDA/spirv-attrs.cu b/clang/test/SemaCUDA/spirv-attrs.cu
new file mode 100644
index 0000000000000..6539421423ee1
--- /dev/null
+++ b/clang/test/SemaCUDA/spirv-attrs.cu
@@ -0,0 +1,18 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple spirv64 -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   -fcuda-is-device -verify -fsyntax-only %s
+
+#include "Inputs/cuda.h"
+
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__global__ void reqd_work_group_size_128_1_1() {}
+
+__attribute__((work_group_size_hint(2, 2, 2)))
+__global__ void work_group_size_hint_2_2_2() {}
+
+__attribute__((vec_type_hint(int)))
+__global__ void vec_type_hint_int() {}
+
+__attribute__((intel_reqd_sub_group_size(64)))
+__global__ void intel_reqd_sub_group_size_64() {}
diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp
index 81e9193cf76a0..f89b556f5bba0 100644
--- a/clang/test/SemaCXX/attr-lifetimebound.cpp
+++ b/clang/test/SemaCXX/attr-lifetimebound.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=c++23 -verify %s
 
 namespace usage_invalid {
-  void void_return(int &param [[clang::lifetimebound]]); // expected-error {{'lifetimebound' attribute cannot be applied to a parameter of a function that returns void}}
+  void void_return(int &param [[clang::lifetimebound]]); // expected-error {{'lifetimebound' attribute cannot be applied to a parameter of a function that returns void; did you mean 'lifetime_capture_by(X)'}}
 
   int *not_class_member() [[clang::lifetimebound]]; // expected-error {{non-member function has no implicit object parameter}}
   struct A {
@@ -11,7 +11,7 @@ namespace usage_invalid {
     int *explicit_object(this A&) [[clang::lifetimebound]]; // expected-error {{explicit object member function has no implicit object parameter}}
     int not_function [[clang::lifetimebound]]; // expected-error {{only applies to parameters and implicit object parameters}}
     int [[clang::lifetimebound]] also_not_function; // expected-error {{cannot be applied to types}}
-    void void_return_member() [[clang::lifetimebound]]; // expected-error {{'lifetimebound' attribute cannot be applied to an implicit object parameter of a function that returns void}}
+    void void_return_member() [[clang::lifetimebound]]; // expected-error {{'lifetimebound' attribute cannot be applied to an implicit object parameter of a function that returns void; did you mean 'lifetime_capture_by(X)'}}
   };
   int *attr_with_param(int &param [[clang::lifetimebound(42)]]); // expected-error {{takes no arguments}}
 }
diff --git a/clang/test/SemaCXX/builtin-bit-cast.cpp b/clang/test/SemaCXX/builtin-bit-cast.cpp
index d7f24c7939b55..8717371b941b0 100644
--- a/clang/test/SemaCXX/builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/builtin-bit-cast.cpp
@@ -32,10 +32,10 @@ struct not_trivially_copyable {
   virtual void foo() {}
 };
 
-// expected-error@+1{{__builtin_bit_cast source type must be trivially copyable}}
+// expected-error@+1{{'__builtin_bit_cast' source type must be trivially copyable}}
 constexpr unsigned long ul = __builtin_bit_cast(unsigned long, not_trivially_copyable{});
 
-// expected-error@+1 {{__builtin_bit_cast destination type must be trivially copyable}}
+// expected-error@+1 {{'__builtin_bit_cast' destination type must be trivially copyable}}
 constexpr long us = __builtin_bit_cast(unsigned long &, 0L);
 
 namespace PR42936 {
diff --git a/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp b/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp
index 25d1f8df7f716..19e7d4976428a 100644
--- a/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp
+++ b/clang/test/SemaCXX/constexpr-return-non-void-cxx2b.cpp
@@ -1,7 +1,36 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -Wimplicit-fallthrough -verify %s
 
 constexpr int f() { } // expected-warning {{non-void function does not return a value}}
 static_assert(__is_same(decltype([] constexpr -> int { }( )), int)); // expected-warning {{non-void lambda does not return a value}}
 
 consteval int g() { } // expected-warning {{non-void function does not return a value}}
 static_assert(__is_same(decltype([] consteval -> int { }( )), int)); // expected-warning {{non-void lambda does not return a value}}
+
+namespace GH116485 {
+int h() {
+    if consteval { }
+} // expected-warning {{non-void function does not return a value}}
+
+void i(int x) {
+  if consteval {
+  }
+  switch (x) {
+  case 1:
+    i(1);
+  case 2: // expected-warning {{unannotated fall-through between switch labels}} \
+          // expected-note {{insert 'break;' to avoid fall-through}}
+    break;
+  }
+}
+
+constexpr bool j()  {
+    if !consteval { return true; }
+} // expected-warning {{non-void function does not return a value in all control paths}} \
+  // expected-note {{control reached end of constexpr function}}
+
+bool k = j();
+constinit bool l = j(); // expected-error {{variable does not have a constant initializer}} \
+                        // expected-note {{required by 'constinit' specifier here}} \
+                        // expected-note {{in call to 'j()'}}
+
+}
diff --git a/clang/test/SemaCXX/warn-shadow.cpp b/clang/test/SemaCXX/warn-shadow.cpp
index 2969bd39fed41..98a235a73c7e5 100644
--- a/clang/test/SemaCXX/warn-shadow.cpp
+++ b/clang/test/SemaCXX/warn-shadow.cpp
@@ -307,3 +307,17 @@ void test4() {
 }
 
 }; // namespace structured_binding_tests
+
+namespace GH62588 {
+class Outer {
+public:
+  char *foo();          // expected-note {{previous declaration is here}} \
+                        // expected-note {{previous definition is here}}
+  enum Outer_E { foo }; // expected-error {{redefinition of 'foo'}} \
+                        // expected-warning {{declaration shadows a static data member of 'GH62588::Outer'}}
+  class Inner {
+  public:
+    enum Inner_E { foo }; // ok
+  };
+};
+} // namespace GH62588
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
index 0228e42652bd9..a6a334d247023 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
@@ -173,4 +173,21 @@ A false_negatives(std::span<int> span_pt, span<A> span_A) {
   return *a2; // TODO: Can cause OOB if span_pt is empty
 
 }
+
+void test_incomplete_type(std::span<char> S) {
+  (struct IncompleteStruct *)S.data(); // expected-warning{{unsafe invocation of 'data'}}
+  (class IncompleteClass *)S.data();   // expected-warning{{unsafe invocation of 'data'}}
+  (union IncompleteUnion *)S.data();   // expected-warning{{unsafe invocation of 'data'}}
+}
+
+void test_complete_type(std::span<long> S) {
+  (struct CompleteStruct *)S.data(); // no warn as the struct size is smaller than long
+  (class CompleteClass *)S.data();   // no warn as the class size is smaller than long
+  (union CompleteUnion *)S.data();   // no warn as the union size is smaller than long
+
+  struct CompleteStruct {};
+  class CompleteClass {};
+  union CompleteUnion {};
+}
+
 #endif
diff --git a/clang/test/SemaCXX/warn-unused-result.cpp b/clang/test/SemaCXX/warn-unused-result.cpp
index 4b7a2503ecc0d..682c500dc1d96 100644
--- a/clang/test/SemaCXX/warn-unused-result.cpp
+++ b/clang/test/SemaCXX/warn-unused-result.cpp
@@ -108,7 +108,7 @@ void lazy() {
   (void)DoAnotherThing();
   (void)DoYetAnotherThing();
 
-  DoSomething(); // expected-warning {{ignoring return value}}
+  DoSomething(); // expected-warning {{ignoring return value of type 'Status' declared with 'warn_unused_result'}}
   DoSomethingElse();
   DoAnotherThing();
   DoYetAnotherThing();
@@ -120,11 +120,11 @@ class [[clang::warn_unused_result]] StatusOr {
 StatusOr<int> doit();
 void test() {
   Foo f;
-  f.doStuff(); // expected-warning {{ignoring return value}}
-  doit(); // expected-warning {{ignoring return value}}
+  f.doStuff(); // expected-warning {{ignoring return value of type 'Status' declared with 'warn_unused_result'}}
+  doit(); // expected-warning {{ignoring return value of type 'StatusOr<int>' declared with 'warn_unused_result'}}
 
   auto func = []() { return Status(); };
-  func(); // expected-warning {{ignoring return value}}
+  func(); // expected-warning {{ignoring return value of type 'Status' declared with 'warn_unused_result'}}
 }
 }
 
@@ -139,7 +139,7 @@ struct Status {};
 
 void Bar() {
   Foo f;
-  f.Bar(); // expected-warning {{ignoring return value}}
+  f.Bar(); // expected-warning {{ignoring return value of type 'Status' declared with 'warn_unused_result'}}
 };
 
 }
@@ -215,18 +215,18 @@ P operator--(const P &) { return {}; };
 void f() {
   S s;
   P p;
-  s.DoThing(); // expected-warning {{ignoring return value}}
-  p.DoThing(); // expected-warning {{ignoring return value}}
+  s.DoThing(); // expected-warning {{ignoring return value of type 'S' declared with 'warn_unused_result'}}
+  p.DoThing(); // expected-warning {{ignoring return value of type 'P' declared with 'warn_unused_result'}}
   // Only postfix is expected to warn when written correctly.
-  s++; // expected-warning {{ignoring return value}}
-  s--; // expected-warning {{ignoring return value}}
-  p++; // expected-warning {{ignoring return value}}
-  p--; // expected-warning {{ignoring return value}}
+  s++; // expected-warning {{ignoring return value of type 'S' declared with 'warn_unused_result'}}
+  s--; // expected-warning {{ignoring return value of type 'S' declared with 'warn_unused_result'}}
+  p++; // expected-warning {{ignoring return value of type 'P' declared with 'warn_unused_result'}}
+  p--; // expected-warning {{ignoring return value of type 'P' declared with 'warn_unused_result'}}
   // Improperly written prefix operators should still warn.
-  ++s; // expected-warning {{ignoring return value}}
-  --s; // expected-warning {{ignoring return value}}
-  ++p; // expected-warning {{ignoring return value}}
-  --p; // expected-warning {{ignoring return value}}
+  ++s; // expected-warning {{ignoring return value of type 'S' declared with 'warn_unused_result'}}
+  --s; // expected-warning {{ignoring return value of type 'S' declared with 'warn_unused_result'}}
+  ++p; // expected-warning {{ignoring return value of type 'P' declared with 'warn_unused_result'}}
+  --p; // expected-warning {{ignoring return value of type 'P' declared with 'warn_unused_result'}}
 
   // Silencing the warning by cast to void still works.
   (void)s.DoThing();
@@ -243,7 +243,7 @@ namespace PR39837 {
 void g() {
   int a[2];
   for (int b : a)
-    f(b); // expected-warning {{ignoring return value}}
+    f(b); // expected-warning {{ignoring return value of function declared with 'warn_unused_result'}}
 }
 } // namespace PR39837
 
@@ -261,12 +261,12 @@ typedef a indirect;
 a af1();
 indirect indirectf1();
 void af2() {
-  af1(); // expected-warning {{ignoring return value}}
+  af1(); // expected-warning {{ignoring return value of type 'a' declared with 'warn_unused_result'}}
   void *(*a1)();
   a1(); // no warning
   a (*a2)();
-  a2(); // expected-warning {{ignoring return value}}
-  indirectf1(); // expected-warning {{ignoring return value}}
+  a2(); // expected-warning {{ignoring return value of type 'a' declared with 'warn_unused_result'}}
+  indirectf1(); // expected-warning {{ignoring return value of type 'a' declared with 'warn_unused_result'}}
 }
 [[nodiscard]] typedef void *b1; // expected-warning {{'[[nodiscard]]' attribute ignored when applied to a typedef; consider using '__attribute__((warn_unused_result))' or '[[clang::warn_unused_result]]' instead}}
 [[gnu::warn_unused_result]] typedef void *b2; // expected-warning {{'[[gnu::warn_unused_result]]' attribute ignored when applied to a typedef; consider using '__attribute__((warn_unused_result))' or '[[clang::warn_unused_result]]' instead}}
@@ -279,10 +279,79 @@ void bf2() {
 __attribute__((warn_unused_result)) typedef void *c;
 c cf1();
 void cf2() {
-  cf1(); // expected-warning {{ignoring return value}}
+  cf1(); // expected-warning {{ignoring return value of type 'c' declared with 'warn_unused_result'}}
   void *(*c1)();
   c1();
   c (*c2)();
-  c2(); // expected-warning {{ignoring return value}}
+  c2(); // expected-warning {{ignoring return value of type 'c' declared with 'warn_unused_result'}}
 }
 }
+
+namespace nodiscard_specialization {
+// Test marking only a specialization of a class template as nodiscard
+template<typename T> struct S { S(int) {} };
+template<> struct [[nodiscard]] S<int> { S(int) {} };
+template<typename T> struct [[clang::warn_unused_result]] S<const T> { S(int) {} };
+
+template<typename T>
+S<T> obtain(const T&) { return {2}; }
+
+template<typename T>
+[[nodiscard]] S<T> obtain2(const T&) { return {2}; }
+
+template<typename T>
+__attribute__((warn_unused_result)) S<T> obtain3(const T&) { return {2}; }
+
+void use() {
+  obtain(1.0);             // no warning
+  obtain(1);               // expected-warning {{ignoring return value of type 'S<int>' declared with 'nodiscard'}}
+  obtain<const double>(1); // expected-warning {{ignoring return value of type 'S<const double>' declared with 'warn_unused_result'}}
+
+  S<double>(2);     // no warning
+  S<int>(2);        // expected-warning {{ignoring temporary of type 'S<int>' declared with 'nodiscard'}}
+  S<const char>(2); // no warning (warn_unused_result does not diagnose constructor temporaries)
+
+  // function should take precedence over type
+  obtain2(1.0);             // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+  obtain2(1);               // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+  obtain2<const double>(1); // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+  obtain3(1.0);             // expected-warning {{ignoring return value of function declared with 'warn_unused_result'}}
+  obtain3(1);               // expected-warning {{ignoring return value of function declared with 'warn_unused_result'}}
+  obtain3<const double>(1); // expected-warning {{ignoring return value of function declared with 'warn_unused_result'}}
+}
+
+// Test on constructor nodiscard
+struct H {
+  explicit H(int) {}
+  [[nodiscard]] explicit H(double) {}
+  __attribute__((warn_unused_result)) H(const char*) {}
+};
+
+struct [[nodiscard]] G {
+  explicit G(int) {}
+  [[nodiscard]] explicit G(double) {}
+  [[clang::warn_unused_result]] G(const char*) {}
+};
+
+void use2() {
+  H{2};       // no warning
+  H(2.0);     // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard'}}
+  H("Hello"); // no warning (warn_unused_result does not diagnose constructor temporaries)
+
+  // no warning for explicit cast to void
+  (void)H(2);
+  (void)H{2.0};
+  (void)H{"Hello"};
+
+  // warns for all these invocations
+  // here, constructor/function should take precedence over type
+  G{2};       // expected-warning {{ignoring temporary of type 'G' declared with 'nodiscard'}}
+  G(2.0);     // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard'}}
+  G("Hello"); // expected-warning {{ignoring temporary created by a constructor declared with 'warn_unused_result'}}
+
+  // no warning for explicit cast to void
+  (void)G(2);
+  (void)G{2.0};
+  (void)G{"Hello"};
+}
+} // namespace nodiscard_specialization
diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
index 76b5d01b8036e..3c2ea557b1982 100644
--- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl
@@ -14,6 +14,6 @@ RWBuffer<> BufferErr2;
 
 [numthreads(1,1,1)]
 void main() {
-  (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::RWBuffer<vector<float, 3>>'}}
+  (void)Buffer.__handle; // expected-error {{'__handle' is a private member of 'hlsl::RWBuffer<vector<float, 3>>'}}
   // expected-note@* {{implicitly declared private here}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
index a472d5519dc51..b0cf9453cecfc 100644
--- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
@@ -14,6 +14,6 @@ StructuredBuffer<> BufferErr2;
 
 [numthreads(1,1,1)]
 void main() {
-  (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}}
+  (void)Buffer.__handle; // expected-error {{'__handle' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}}
   // expected-note@* {{implicitly declared private here}}
 }
diff --git a/clang/test/SemaObjC/comptypes-legal.m b/clang/test/SemaObjC/comptypes-legal.m
index 09c3a7261bd58..8e332f42be842 100644
--- a/clang/test/SemaObjC/comptypes-legal.m
+++ b/clang/test/SemaObjC/comptypes-legal.m
@@ -41,7 +41,7 @@ @interface I
 - (void) Meth : (id <NSCopying>)aKey; // expected-note {{passing argument to parameter 'aKey' here}}
 @end
 
-@class ForwarClass; // expected-note 3 {{conformance of forward class 'ForwarClass' to protocol 'NSCopying' can not be confirmed}}
+@class ForwarClass; // expected-note 3 {{conformance of forward class 'ForwarClass' to protocol 'NSCopying' cannot be confirmed}}
 
 ForwarClass *Test10751015 (I* pi, ForwarClass *ns_forward) {
 
diff --git a/clang/test/SemaObjC/method-param-named-id.m b/clang/test/SemaObjC/method-param-named-id.m
new file mode 100644
index 0000000000000..8269c31116c32
--- /dev/null
+++ b/clang/test/SemaObjC/method-param-named-id.m
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -Wno-objc-root-class %s
+
+
+@interface Foo
+-(void)paramNamedID:(int)id usesIDType:(id)notShadowed;
+-(void)paramNamedID:(int)id, id notShadowed; // expected-warning{{use of C-style parameters in Objective-C method declarations is deprecated}}
+@end
diff --git a/clang/test/SemaOpenCL/access-qualifier.cl b/clang/test/SemaOpenCL/access-qualifier.cl
index 726253c0b1a23..d1c9b5e35af6c 100644
--- a/clang/test/SemaOpenCL/access-qualifier.cl
+++ b/clang/test/SemaOpenCL/access-qualifier.cl
@@ -36,7 +36,7 @@ void myRead(read_only image1d_t);
 #if (__OPENCL_C_VERSION__ == 200) || ((__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300) && defined(__opencl_c_read_write_images))
 void myReadWrite(read_write image1d_t);
 #else
-void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' can not be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}}
+void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' cannot be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}}
 #endif
 
 
@@ -94,9 +94,9 @@ kernel void k11(read_only write_only image1d_t i){} // expected-error{{multiple
 kernel void k12(read_only read_only image1d_t i){} // expected-warning {{duplicate 'read_only' declaration specifier}}
 
 #if (__OPENCL_C_VERSION__ == 200) || ((__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300) && defined(__opencl_c_read_write_images))
-kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'read_only pipe int'}}
+kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' cannot be used for 'read_only pipe int'}}
 #else
-kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' can not be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}}
+kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' cannot be used for '__read_write image1d_t' prior to OpenCL C version 2.0 or in version 3.0 and without __opencl_c_read_write_images feature}}
 #endif
 
 #if defined(__OPENCL_C_VERSION__) && __OPENCL_C_VERSION__ < 200
@@ -116,7 +116,7 @@ kernel void k14(read_only pipe int p) {
 
 kernel void pipe_ro_twice(read_only read_only pipe int i){} // expected-warning{{duplicate 'read_only' declaration specifier}}
 // Conflicting access qualifiers
-kernel void pipe_ro_twice_tw(read_write read_only read_only pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'read_only pipe int'}}
+kernel void pipe_ro_twice_tw(read_write read_only read_only pipe int i){} // expected-error{{access qualifier 'read_write' cannot be used for 'read_only pipe int'}}
 kernel void pipe_ro_wo(read_only write_only pipe int i){} // expected-error{{multiple access qualifiers}}
 
 typedef read_only pipe int ROPipeInt;
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
new file mode 100644
index 0000000000000..4af67763c40dd
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -0,0 +1,28 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx950 -verify -S -o - %s
+
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+
+
+void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
+
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_f16' must be a constant integer}}
+}
+
+
+void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
+}
+
+void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
new file mode 100644
index 0000000000000..e0fd2aa5c58a0
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -0,0 +1,15 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s
+
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+typedef half half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+
+void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
+          __global float16* out1, half8 a1, half8 b1, float16 c1,
+          __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
+  *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
+  *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
+  *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
+}
diff --git a/clang/tools/amdgpu-arch/AMDGPUArch.cpp b/clang/tools/amdgpu-arch/AMDGPUArch.cpp
index 7ae57b7877e1f..6c10cbc5c46a8 100644
--- a/clang/tools/amdgpu-arch/AMDGPUArch.cpp
+++ b/clang/tools/amdgpu-arch/AMDGPUArch.cpp
@@ -25,7 +25,7 @@ static void PrintVersion(raw_ostream &OS) {
   OS << clang::getClangToolFullVersion("amdgpu-arch") << '\n';
 }
 
-int printGPUsByHSA();
+int printGPUsByKFD();
 int printGPUsByHIP();
 
 int main(int argc, char *argv[]) {
@@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
   }
 
 #ifndef _WIN32
-  if (!printGPUsByHSA())
+  if (!printGPUsByKFD())
     return 0;
 #endif
 
diff --git a/clang/tools/amdgpu-arch/AMDGPUArchByHSA.cpp b/clang/tools/amdgpu-arch/AMDGPUArchByHSA.cpp
deleted file mode 100644
index 432f2c414ed24..0000000000000
--- a/clang/tools/amdgpu-arch/AMDGPUArchByHSA.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===- AMDGPUArchByHSA.cpp - list AMDGPU installed ------*- C++ -*---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a tool for detecting name of AMDGPU installed in system
-// using HSA on Linux. This tool is used by AMDGPU OpenMP and HIP driver.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/Version.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-
-typedef enum {
-  HSA_STATUS_SUCCESS = 0x0,
-} hsa_status_t;
-
-typedef enum {
-  HSA_DEVICE_TYPE_CPU = 0,
-  HSA_DEVICE_TYPE_GPU = 1,
-} hsa_device_type_t;
-
-typedef enum {
-  HSA_AGENT_INFO_NAME = 0,
-  HSA_AGENT_INFO_DEVICE = 17,
-} hsa_agent_info_t;
-
-typedef struct hsa_agent_s {
-  uint64_t handle;
-} hsa_agent_t;
-
-hsa_status_t (*hsa_init)();
-hsa_status_t (*hsa_shut_down)();
-hsa_status_t (*hsa_agent_get_info)(hsa_agent_t, hsa_agent_info_t, void *);
-hsa_status_t (*hsa_iterate_agents)(hsa_status_t (*)(hsa_agent_t, void *),
-                                   void *);
-
-constexpr const char *DynamicHSAPath = "libhsa-runtime64.so";
-
-llvm::Error loadHSA() {
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(DynamicHSAPath, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "Failed to 'dlopen' %s", DynamicHSAPath);
-  }
-#define DYNAMIC_INIT(SYMBOL)                                                   \
-  {                                                                            \
-    void *SymbolPtr = DynlibHandle->getAddressOfSymbol(#SYMBOL);               \
-    if (!SymbolPtr)                                                            \
-      return llvm::createStringError(llvm::inconvertibleErrorCode(),           \
-                                     "Failed to 'dlsym' " #SYMBOL);            \
-    SYMBOL = reinterpret_cast<decltype(SYMBOL)>(SymbolPtr);                    \
-  }
-  DYNAMIC_INIT(hsa_init);
-  DYNAMIC_INIT(hsa_shut_down);
-  DYNAMIC_INIT(hsa_agent_get_info);
-  DYNAMIC_INIT(hsa_iterate_agents);
-#undef DYNAMIC_INIT
-  return llvm::Error::success();
-}
-
-static hsa_status_t iterateAgentsCallback(hsa_agent_t Agent, void *Data) {
-  hsa_device_type_t DeviceType;
-  hsa_status_t Status =
-      hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
-
-  // continue only if device type if GPU
-  if (Status != HSA_STATUS_SUCCESS || DeviceType != HSA_DEVICE_TYPE_GPU) {
-    return Status;
-  }
-
-  std::vector<std::string> *GPUs =
-      static_cast<std::vector<std::string> *>(Data);
-  char GPUName[64];
-  Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, GPUName);
-  if (Status != HSA_STATUS_SUCCESS) {
-    return Status;
-  }
-  GPUs->push_back(GPUName);
-  return HSA_STATUS_SUCCESS;
-}
-
-int printGPUsByHSA() {
-  // Attempt to load the HSA runtime.
-  if (llvm::Error Err = loadHSA()) {
-    logAllUnhandledErrors(std::move(Err), llvm::errs());
-    return 1;
-  }
-
-  hsa_status_t Status = hsa_init();
-  if (Status != HSA_STATUS_SUCCESS) {
-    return 1;
-  }
-
-  std::vector<std::string> GPUs;
-  Status = hsa_iterate_agents(iterateAgentsCallback, &GPUs);
-  if (Status != HSA_STATUS_SUCCESS) {
-    return 1;
-  }
-
-  for (const auto &GPU : GPUs)
-    llvm::outs() << GPU << '\n';
-
-  if (GPUs.size() < 1)
-    return 1;
-
-  hsa_shut_down();
-  return 0;
-}
diff --git a/clang/tools/amdgpu-arch/AMDGPUArchByKFD.cpp b/clang/tools/amdgpu-arch/AMDGPUArchByKFD.cpp
new file mode 100644
index 0000000000000..94ebf9073e00e
--- /dev/null
+++ b/clang/tools/amdgpu-arch/AMDGPUArchByKFD.cpp
@@ -0,0 +1,77 @@
+//===- AMDGPUArchByKFD.cpp - list AMDGPU installed ------*- C++ -*---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a tool for detecting the names of AMD GPUs installed in
+// the system using the Linux sysfs interface for the AMD KFD driver. This file
+// does not respect ROCR_VISIBLE_DEVICES like the ROCm environment would.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include <memory>
+
+using namespace llvm;
+
+constexpr static const char *KFD_SYSFS_NODE_PATH =
+    "/sys/devices/virtual/kfd/kfd/topology/nodes";
+
+// Decode gfx_target_version (major * 10000 + minor * 100 + step); cf. ROCm:
+// https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/src/libhsakmt.h#L126
+constexpr static long getMajor(long Ver) { return (Ver / 10000) % 100; }
+constexpr static long getMinor(long Ver) { return (Ver / 100) % 100; }
+constexpr static long getStep(long Ver) { return Ver % 100; }
+
+/// Prints a "gfx..." name for every KFD topology node with a nonzero
+/// gfx_target_version, ordered by node number. Returns 1 if a node entry or
+/// its properties file cannot be read or parsed, and 0 otherwise.
+int printGPUsByKFD() {
+  SmallVector<std::pair<long, long>> Devices;
+  std::error_code EC;
+  for (sys::fs::directory_iterator Begin(KFD_SYSFS_NODE_PATH, EC), End;
+       Begin != End; Begin.increment(EC)) {
+    if (EC)
+      return 1;
+    long Node = 0;
+    if (sys::path::stem(Begin->path()).consumeInteger(10, Node))
+      return 1;
+
+    SmallString<0> Path(Begin->path());
+    sys::path::append(Path, "properties");
+
+    ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+        MemoryBuffer::getFileOrSTDIN(Path);
+    if (!BufferOrErr)
+      return 1;
+    long GFXVersion = 0;
+    for (line_iterator Lines(**BufferOrErr, false); !Lines.is_at_end();
+         ++Lines) {
+      StringRef Line(*Lines);
+      if (Line.consume_front("gfx_target_version")) {
+        // ltrim() skips the separating whitespace; no <cctype> call needed.
+        if (Line.ltrim().consumeInteger(10, GFXVersion))
+          return 1;
+        break;
+      }
+    }
+
+    // If this is zero the node is a CPU.
+    if (GFXVersion == 0)
+      continue;
+    Devices.emplace_back(Node, GFXVersion);
+  }
+
+  // Sort the devices by their node to make sure it prints in order.
+  llvm::sort(Devices, [](auto &L, auto &R) { return L.first < R.first; });
+  for (const auto &[Node, GFXVersion] : Devices)
+    std::fprintf(stdout, "gfx%ld%ld%lx\n", getMajor(GFXVersion),
+                 getMinor(GFXVersion), getStep(GFXVersion));
+  return 0;
+}
diff --git a/clang/tools/amdgpu-arch/CMakeLists.txt b/clang/tools/amdgpu-arch/CMakeLists.txt
index 1657c70125130..c4c8de614565a 100644
--- a/clang/tools/amdgpu-arch/CMakeLists.txt
+++ b/clang/tools/amdgpu-arch/CMakeLists.txt
@@ -8,6 +8,6 @@
 
 set(LLVM_LINK_COMPONENTS Support)
 
-add_clang_tool(amdgpu-arch AMDGPUArch.cpp AMDGPUArchByHSA.cpp AMDGPUArchByHIP.cpp)
+add_clang_tool(amdgpu-arch AMDGPUArch.cpp AMDGPUArchByKFD.cpp AMDGPUArchByHIP.cpp)
 
 target_link_libraries(amdgpu-arch PRIVATE clangBasic)
diff --git a/clang/tools/clang-shlib/CMakeLists.txt b/clang/tools/clang-shlib/CMakeLists.txt
index 298d3a9d18fec..d83c13fd394f4 100644
--- a/clang/tools/clang-shlib/CMakeLists.txt
+++ b/clang/tools/clang-shlib/CMakeLists.txt
@@ -48,6 +48,14 @@ add_clang_library(clang-cpp
                   ${_OBJECTS}
                   LINK_LIBS
                   ${_DEPS})
+
+configure_file(simple_version_script.map.in simple_version_script.map)
+
+if (NOT APPLE AND NOT MSVC AND NOT MINGW AND NOT LLVM_LINKER_IS_SOLARISLD)
+  # Solaris ld does not accept global: *; so there is no way to version *all* global symbols
+  target_link_options(clang-cpp PRIVATE LINKER:--version-script,${CMAKE_CURRENT_BINARY_DIR}/simple_version_script.map)
+endif()
+
 # Optimize function calls for default visibility definitions to avoid PLT and
 # reduce dynamic relocations.
 if (NOT APPLE AND NOT MINGW AND NOT LLVM_LINKER_IS_SOLARISLD_ILLUMOS)
diff --git a/clang/tools/clang-shlib/simple_version_script.map.in b/clang/tools/clang-shlib/simple_version_script.map.in
new file mode 100644
index 0000000000000..cb2306d1f5968
--- /dev/null
+++ b/clang/tools/clang-shlib/simple_version_script.map.in
@@ -0,0 +1 @@
+@LLVM_SHLIB_SYMBOL_VERSION@ { global: *; };
diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp
index 49d276fc78d81..e4a14ff754d1a 100644
--- a/clang/unittests/Format/FormatTestVerilog.cpp
+++ b/clang/unittests/Format/FormatTestVerilog.cpp
@@ -702,6 +702,18 @@ TEST_F(FormatTestVerilog, Hierarchy) {
                "  generate\n"
                "  endgenerate\n"
                "endfunction : x");
+  // Type names with '::' should be recognized.
+  verifyFormat("function automatic x::x x\n"
+               "    (input x);\n"
+               "endfunction : x");
+  // Names having to do with macros should be recognized.
+  verifyFormat("function automatic x::x x``x\n"
+               "    (input x);\n"
+               "endfunction : x");
+  verifyFormat("function automatic x::x `x\n"
+               "    (input x);\n"
+               "endfunction : x");
+  verifyNoCrash("x x(x x, x x);");
 }
 
 TEST_F(FormatTestVerilog, Identifiers) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index bb8ee416ea2db..e1ae1770e8ebe 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2598,6 +2598,20 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
   Tokens = Annotate("x = '{\"\"};");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[4], tok::string_literal, TT_Unknown);
+
+  // Module headers.
+  Tokens = Annotate("module x();\nendmodule");
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_VerilogMultiLineListLParen);
+  Tokens = Annotate("function automatic `x x();\nendmodule");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_VerilogMultiLineListLParen);
+  Tokens = Annotate("function automatic x``x x();\nendmodule");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_VerilogMultiLineListLParen);
+  Tokens = Annotate("function automatic x::x x();\nendmodule");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_VerilogMultiLineListLParen);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index def11f88c4854..74e5e01b66c54 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -48,6 +48,8 @@ _Bool __aarch64_have_lse_atomics
 #elif defined(__linux__) && __has_include(<sys/auxv.h>)
 #include "aarch64/hwcap.inc"
 #include "aarch64/lse_atomics/getauxval.inc"
+#elif defined(_WIN32)
+#include "aarch64/lse_atomics/windows.inc"
 #else
 // When unimplemented, we leave __aarch64_have_lse_atomics initialized to false.
 #endif
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/windows.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/windows.inc
new file mode 100644
index 0000000000000..fff1593e1fac3
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/windows.inc
@@ -0,0 +1,12 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <processthreadsapi.h>
+
+#ifndef PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE 34
+#endif
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  if (IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE))
+    __aarch64_have_lse_atomics = true;
+}
diff --git a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
index d92b510521942..e8011014c2331 100644
--- a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
+++ b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
@@ -200,6 +200,9 @@
 #undef SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID
 #define SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID 0
 
+#undef SANITIZER_INTERCEPT_TIMER_CREATE
+#define SANITIZER_INTERCEPT_TIMER_CREATE 0
+
 #undef SANITIZER_INTERCEPT_GETITIMER
 #define SANITIZER_INTERCEPT_GETITIMER 0
 
diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index 5c44c000ae577..7ab9e4ff2ac9f 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -569,7 +569,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
     PtraceRegistersStatus have_registers =
         suspended_threads.GetRegistersAndSP(i, &registers, &sp);
     if (have_registers != REGISTERS_AVAILABLE) {
-      Report("Unable to get registers from thread %llu.\n", os_id);
+      VReport(1, "Unable to get registers from thread %llu.\n", os_id);
       // If unable to get SP, consider the entire stack to be reachable unless
       // GetRegistersAndSP failed with ESRCH.
       if (have_registers == REGISTERS_UNAVAILABLE_FATAL)
diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index 41b99fabe84f4..a126dd4fdd55e 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -4881,4 +4881,32 @@ TEST(MemorySanitizer, throw_catch) {
     // pass
   }
 }
+
+#if defined(__GLIBC__)
+TEST(MemorySanitizer, timer_create) {
+  timer_t timer;
+  EXPECT_POISONED(timer);
+  int res = timer_create(CLOCK_REALTIME, nullptr, &timer);
+  ASSERT_EQ(0, res);
+  EXPECT_NOT_POISONED(timer);
+
+  // Make sure the timer is usable.
+  struct itimerspec cur_value {};
+  cur_value.it_value.tv_sec = 1;
+  EXPECT_EQ(0, timer_settime(timer, 0, &cur_value, nullptr));
+
+  struct itimerspec read_value;
+  EXPECT_POISONED(read_value);
+  EXPECT_EQ(0, timer_gettime(timer, &read_value));
+  EXPECT_NOT_POISONED(read_value);
+
+  timer_t timer2;
+  EXPECT_POISONED(timer2);
+  // Use an invalid clock_id to make timer_create fail.
+  res = timer_create(INT_MAX, nullptr, &timer2);
+  ASSERT_EQ(-1, res);
+  EXPECT_POISONED(timer2);
+  timer_delete(timer);
+}
+#endif
 } // namespace
diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp
index e9f42d3760aa8..70edcc546219f 100644
--- a/compiler-rt/lib/rtsan/rtsan.cpp
+++ b/compiler-rt/lib/rtsan/rtsan.cpp
@@ -62,8 +62,12 @@ static void OnViolation(const BufferedStackTrace &stack,
   if (UNLIKELY(is_stack_novel)) {
     IncrementUniqueErrorCount();
 
-    PrintDiagnostics(info);
-    stack.Print();
+    {
+      ScopedErrorReportLock l;
+      PrintDiagnostics(info);
+      stack.Print();
+      PrintErrorSummary(info, stack);
+    }
 
     handle.inc_use_count_unsafe();
   }
diff --git a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp
index ecba30d2ab8df..f13d3db101d48 100644
--- a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp
@@ -39,20 +39,22 @@ class Decorator : public SanitizerCommonDecorator {
 };
 } // namespace
 
+static const char *GetErrorTypeStr(const DiagnosticsInfo &info) {
+  switch (info.type) {
+  case DiagnosticsInfoType::InterceptedCall:
+    return "unsafe-library-call";
+  case DiagnosticsInfoType::BlockingCall:
+    return "blocking-call";
+  }
+  CHECK(false);
+  return "(unknown error)";
+}
+
 static void PrintError(const Decorator &decorator,
                        const DiagnosticsInfo &info) {
-  const auto ErrorTypeStr = [&info]() -> const char * {
-    switch (info.type) {
-    case DiagnosticsInfoType::InterceptedCall:
-      return "unsafe-library-call";
-    case DiagnosticsInfoType::BlockingCall:
-      return "blocking-call";
-    }
-    return "(unknown error)";
-  };
 
   Printf("%s", decorator.Error());
-  Report("ERROR: RealtimeSanitizer: %s\n", ErrorTypeStr());
+  Report("ERROR: RealtimeSanitizer: %s\n", GetErrorTypeStr(info));
 }
 
 static void PrintReason(const Decorator &decorator,
@@ -78,10 +80,16 @@ static void PrintReason(const Decorator &decorator,
 }
 
 void __rtsan::PrintDiagnostics(const DiagnosticsInfo &info) {
-  ScopedErrorReportLock l;
+  ScopedErrorReportLock::CheckLocked();
 
   Decorator d;
   PrintError(d, info);
   PrintReason(d, info);
   Printf("%s", d.Default());
 }
+
+void __rtsan::PrintErrorSummary(const DiagnosticsInfo &info,
+                                const BufferedStackTrace &stack) {
+  ScopedErrorReportLock::CheckLocked();
+  ReportErrorSummary(GetErrorTypeStr(info), &stack);
+}
diff --git a/compiler-rt/lib/rtsan/rtsan_diagnostics.h b/compiler-rt/lib/rtsan/rtsan_diagnostics.h
index f8a6b8a954a24..1138e61eb5df4 100644
--- a/compiler-rt/lib/rtsan/rtsan_diagnostics.h
+++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.h
@@ -30,4 +30,6 @@ struct DiagnosticsInfo {
 };
 
 void PrintDiagnostics(const DiagnosticsInfo &info);
+void PrintErrorSummary(const DiagnosticsInfo &info,
+                       const __sanitizer::BufferedStackTrace &stack);
 } // namespace __rtsan
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 3a1b1f6524745..73448cfc11788 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -42,6 +42,7 @@ void OSSpinLockLock(volatile OSSpinLock *__lock);
 #endif
 
 #include <fcntl.h>
+#include <poll.h>
 #include <pthread.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -315,6 +316,31 @@ INTERCEPTOR(ssize_t, writev, int fd, const struct iovec *iov, int iovcnt) {
   return REAL(writev)(fd, iov, iovcnt);
 }
 
+INTERCEPTOR(off_t, lseek, int fd, off_t offset, int whence) {
+  __rtsan_notify_intercepted_call("lseek");
+  return REAL(lseek)(fd, offset, whence);
+}
+
+#if SANITIZER_INTERCEPT_LSEEK64
+INTERCEPTOR(off64_t, lseek64, int fd, off64_t offset, int whence) {
+  __rtsan_notify_intercepted_call("lseek64");
+  return REAL(lseek64)(fd, offset, whence);
+}
+#define RTSAN_MAYBE_INTERCEPT_LSEEK64 INTERCEPT_FUNCTION(lseek64)
+#else
+#define RTSAN_MAYBE_INTERCEPT_LSEEK64
+#endif // SANITIZER_INTERCEPT_LSEEK64
+
+INTERCEPTOR(int, dup, int oldfd) {
+  __rtsan_notify_intercepted_call("dup");
+  return REAL(dup)(oldfd);
+}
+
+INTERCEPTOR(int, dup2, int oldfd, int newfd) {
+  __rtsan_notify_intercepted_call("dup2");
+  return REAL(dup2)(oldfd, newfd);
+}
+
 // Concurrency
 #if SANITIZER_APPLE
 #pragma clang diagnostic push
@@ -612,6 +638,114 @@ INTERCEPTOR(int, shutdown, int socket, int how) {
   return REAL(shutdown)(socket, how);
 }
 
+// I/O Multiplexing
+
+INTERCEPTOR(int, poll, struct pollfd *fds, nfds_t nfds, int timeout) {
+  __rtsan_notify_intercepted_call("poll");
+  return REAL(poll)(fds, nfds, timeout);
+}
+
+#if !SANITIZER_APPLE
+// FIXME: This should work on all unix systems, even Mac, but currently
+// it fails to build there with the compiler diagnostic:
+// error: declaration of 'select' has a different language linkage
+INTERCEPTOR(int, select, int nfds, fd_set *readfds, fd_set *writefds,
+            fd_set *exceptfds, struct timeval *timeout) {
+  __rtsan_notify_intercepted_call("select");
+  return REAL(select)(nfds, readfds, writefds, exceptfds, timeout);
+}
+#define RTSAN_MAYBE_INTERCEPT_SELECT INTERCEPT_FUNCTION(select)
+#else
+#define RTSAN_MAYBE_INTERCEPT_SELECT
+#endif // !SANITIZER_APPLE
+
+INTERCEPTOR(int, pselect, int nfds, fd_set *readfds, fd_set *writefds,
+            fd_set *exceptfds, const struct timespec *timeout,
+            const sigset_t *sigmask) {
+  __rtsan_notify_intercepted_call("pselect");
+  return REAL(pselect)(nfds, readfds, writefds, exceptfds, timeout, sigmask);
+}
+
+#if SANITIZER_INTERCEPT_EPOLL
+INTERCEPTOR(int, epoll_create, int size) {
+  __rtsan_notify_intercepted_call("epoll_create");
+  return REAL(epoll_create)(size);
+}
+
+INTERCEPTOR(int, epoll_create1, int flags) {
+  __rtsan_notify_intercepted_call("epoll_create1");
+  return REAL(epoll_create1)(flags);
+}
+
+INTERCEPTOR(int, epoll_ctl, int epfd, int op, int fd,
+            struct epoll_event *event) {
+  __rtsan_notify_intercepted_call("epoll_ctl");
+  return REAL(epoll_ctl)(epfd, op, fd, event);
+}
+
+INTERCEPTOR(int, epoll_wait, int epfd, struct epoll_event *events,
+            int maxevents, int timeout) {
+  __rtsan_notify_intercepted_call("epoll_wait");
+  return REAL(epoll_wait)(epfd, events, maxevents, timeout);
+}
+
+INTERCEPTOR(int, epoll_pwait, int epfd, struct epoll_event *events,
+            int maxevents, int timeout, const sigset_t *sigmask) {
+  __rtsan_notify_intercepted_call("epoll_pwait");
+  return REAL(epoll_pwait)(epfd, events, maxevents, timeout, sigmask);
+}
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE INTERCEPT_FUNCTION(epoll_create)
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1 INTERCEPT_FUNCTION(epoll_create1)
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CTL INTERCEPT_FUNCTION(epoll_ctl)
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT INTERCEPT_FUNCTION(epoll_wait)
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT INTERCEPT_FUNCTION(epoll_pwait)
+#else
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_CTL
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT
+#define RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT
+#endif // SANITIZER_INTERCEPT_EPOLL
+
+#if SANITIZER_INTERCEPT_KQUEUE
+INTERCEPTOR(int, kqueue, void) {
+  __rtsan_notify_intercepted_call("kqueue");
+  return REAL(kqueue)();
+}
+
+INTERCEPTOR(int, kevent, int kq, const struct kevent *changelist, int nchanges,
+            struct kevent *eventlist, int nevents,
+            const struct timespec *timeout) {
+  __rtsan_notify_intercepted_call("kevent");
+  return REAL(kevent)(kq, changelist, nchanges, eventlist, nevents, timeout);
+}
+
+INTERCEPTOR(int, kevent64, int kq, const struct kevent64_s *changelist,
+            int nchanges, struct kevent64_s *eventlist, int nevents,
+            unsigned int flags, const struct timespec *timeout) {
+  __rtsan_notify_intercepted_call("kevent64");
+  return REAL(kevent64)(kq, changelist, nchanges, eventlist, nevents, flags,
+                        timeout);
+}
+#define RTSAN_MAYBE_INTERCEPT_KQUEUE INTERCEPT_FUNCTION(kqueue)
+#define RTSAN_MAYBE_INTERCEPT_KEVENT INTERCEPT_FUNCTION(kevent)
+#define RTSAN_MAYBE_INTERCEPT_KEVENT64 INTERCEPT_FUNCTION(kevent64)
+#else
+#define RTSAN_MAYBE_INTERCEPT_KQUEUE
+#define RTSAN_MAYBE_INTERCEPT_KEVENT
+#define RTSAN_MAYBE_INTERCEPT_KEVENT64
+#endif // SANITIZER_INTERCEPT_KQUEUE
+
+INTERCEPTOR(int, pipe, int pipefd[2]) {
+  __rtsan_notify_intercepted_call("pipe");
+  return REAL(pipe)(pipefd);
+}
+
+INTERCEPTOR(int, mkfifo, const char *pathname, mode_t mode) {
+  __rtsan_notify_intercepted_call("mkfifo");
+  return REAL(mkfifo)(pathname, mode);
+}
+
 // Preinit
 void __rtsan::InitializeInterceptors() {
   INTERCEPT_FUNCTION(calloc);
@@ -658,6 +792,10 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_CREAT64;
   INTERCEPT_FUNCTION(puts);
   INTERCEPT_FUNCTION(fputs);
+  INTERCEPT_FUNCTION(lseek);
+  RTSAN_MAYBE_INTERCEPT_LSEEK64;
+  INTERCEPT_FUNCTION(dup);
+  INTERCEPT_FUNCTION(dup2);
 
 #if SANITIZER_APPLE
   INTERCEPT_FUNCTION(OSSpinLockLock);
@@ -696,6 +834,21 @@ void __rtsan::InitializeInterceptors() {
   INTERCEPT_FUNCTION(sendto);
   INTERCEPT_FUNCTION(shutdown);
   INTERCEPT_FUNCTION(socket);
+
+  RTSAN_MAYBE_INTERCEPT_SELECT;
+  INTERCEPT_FUNCTION(pselect);
+  INTERCEPT_FUNCTION(poll);
+  RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE;
+  RTSAN_MAYBE_INTERCEPT_EPOLL_CREATE1;
+  RTSAN_MAYBE_INTERCEPT_EPOLL_CTL;
+  RTSAN_MAYBE_INTERCEPT_EPOLL_WAIT;
+  RTSAN_MAYBE_INTERCEPT_EPOLL_PWAIT;
+  RTSAN_MAYBE_INTERCEPT_KQUEUE;
+  RTSAN_MAYBE_INTERCEPT_KEVENT;
+  RTSAN_MAYBE_INTERCEPT_KEVENT64;
+
+  INTERCEPT_FUNCTION(pipe);
+  INTERCEPT_FUNCTION(mkfifo);
 }
 
 #endif // SANITIZER_POSIX
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index d0ae12c9bea44..3e14346f33c7c 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -28,8 +28,18 @@
 #include <malloc.h>
 #endif
 
+#if SANITIZER_INTERCEPT_EPOLL
+#include <sys/epoll.h>
+#endif
+
+#if SANITIZER_INTERCEPT_KQUEUE
+#include <sys/event.h>
+#include <sys/time.h>
+#endif
+
 #include <fcntl.h>
 #include <netdb.h>
+#include <poll.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <sys/mman.h>
@@ -357,6 +367,24 @@ class RtsanOpenedFileTest : public RtsanFileTest {
   int fd = -1;
 };
 
+TEST_F(RtsanOpenedFileTest, LseekDiesWhenRealtime) {
+  auto Func = [this]() { lseek(GetOpenFd(), 0, SEEK_SET); };
+  ExpectRealtimeDeath(Func, MAYBE_APPEND_64("lseek"));
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanOpenedFileTest, DupDiesWhenRealtime) {
+  auto Func = [this]() { dup(GetOpenFd()); };
+  ExpectRealtimeDeath(Func, "dup");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanOpenedFileTest, Dup2DiesWhenRealtime) {
+  auto Func = [this]() { dup2(GetOpenFd(), 0); };
+  ExpectRealtimeDeath(Func, "dup2");
+  ExpectNonRealtimeSurvival(Func);
+}
+
 TEST_F(RtsanOpenedFileTest, FreadDiesWhenRealtime) {
   auto Func = [this]() {
     char c{};
@@ -779,4 +807,175 @@ TEST(TestRtsanInterceptors, ShutdownOnASocketDiesWhenRealtime) {
   ExpectNonRealtimeSurvival(Func);
 }
 
+/*
+    I/O Multiplexing
+*/
+
+TEST(TestRtsanInterceptors, PollDiesWhenRealtime) {
+  struct pollfd fds[1];
+  fds[0].fd = 0;
+  fds[0].events = POLLIN;
+
+  auto Func = [&fds]() { poll(fds, 1, 0); };
+
+  ExpectRealtimeDeath(Func, "poll");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+#if !SANITIZER_APPLE
+// FIXME: This should work on Darwin as well
+// see the comment near the interceptor
+TEST(TestRtsanInterceptors, SelectDiesWhenRealtime) {
+  fd_set readfds;
+  FD_ZERO(&readfds);
+  FD_SET(0, &readfds);
+  struct timeval timeout = {0, 0};
+
+  auto Func = [&readfds, &timeout]() {
+    select(1, &readfds, nullptr, nullptr, &timeout);
+  };
+  ExpectRealtimeDeath(Func, "select");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
+TEST(TestRtsanInterceptors, PSelectDiesWhenRealtime) {
+  fd_set readfds;
+  FD_ZERO(&readfds);
+  FD_SET(0, &readfds);
+  struct timespec timeout = {0, 0};
+
+  auto Func = [&]() {
+    pselect(1, &readfds, nullptr, nullptr, &timeout, nullptr);
+  };
+  ExpectRealtimeDeath(Func, "pselect");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+#if SANITIZER_INTERCEPT_EPOLL
+TEST(TestRtsanInterceptors, EpollCreateDiesWhenRealtime) {
+  auto Func = []() { epoll_create(1); };
+  ExpectRealtimeDeath(Func, "epoll_create");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST(TestRtsanInterceptors, EpollCreate1DiesWhenRealtime) {
+  auto Func = []() { epoll_create1(EPOLL_CLOEXEC); };
+  ExpectRealtimeDeath(Func, "epoll_create1");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+class EpollTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    epfd = epoll_create1(EPOLL_CLOEXEC);
+    ASSERT_GE(epfd, 0);
+  }
+
+  void TearDown() override {
+    if (epfd >= 0)
+      close(epfd);
+  }
+
+  int GetEpollFd() { return epfd; }
+
+private:
+  int epfd = -1;
+};
+
+TEST_F(EpollTest, EpollCtlDiesWhenRealtime) {
+  auto Func = [this]() {
+    struct epoll_event event = {.events = EPOLLIN, .data = {.fd = 0}};
+    epoll_ctl(GetEpollFd(), EPOLL_CTL_ADD, 0, &event);
+  };
+  ExpectRealtimeDeath(Func, "epoll_ctl");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(EpollTest, EpollWaitDiesWhenRealtime) {
+  auto Func = [this]() {
+    struct epoll_event events[1];
+    epoll_wait(GetEpollFd(), events, 1, 0);
+  };
+
+  ExpectRealtimeDeath(Func, "epoll_wait");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(EpollTest, EpollPWaitDiesWhenRealtime) {
+  auto Func = [this]() {
+    struct epoll_event events[1];
+    epoll_pwait(GetEpollFd(), events, 1, 0, nullptr);
+  };
+
+  ExpectRealtimeDeath(Func, "epoll_pwait");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif // SANITIZER_INTERCEPT_EPOLL
+
+#if SANITIZER_INTERCEPT_KQUEUE
+TEST(TestRtsanInterceptors, KqueueDiesWhenRealtime) {
+  auto Func = []() { kqueue(); };
+  ExpectRealtimeDeath(Func, "kqueue");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+class KqueueTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    kq = kqueue();
+    ASSERT_GE(kq, 0);
+  }
+
+  void TearDown() override {
+    if (kq >= 0)
+      close(kq);
+  }
+
+  int GetKqueueFd() { return kq; }
+
+private:
+  int kq = -1;
+};
+
+TEST_F(KqueueTest, KeventDiesWhenRealtime) {
+  struct kevent event;
+  EV_SET(&event, 0, EVFILT_READ, EV_ADD, 0, 0, nullptr);
+  struct timespec timeout = {0, 0};
+
+  auto Func = [this, event, timeout]() {
+    kevent(GetKqueueFd(), &event, 1, nullptr, 0, &timeout);
+  };
+
+  ExpectRealtimeDeath(Func, "kevent");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(KqueueTest, Kevent64DiesWhenRealtime) {
+  struct kevent64_s event;
+  EV_SET64(&event, 0, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
+  struct timespec timeout = {0, 0};
+
+  auto Func = [this, event, timeout]() {
+    kevent64(GetKqueueFd(), &event, 1, nullptr, 0, 0, &timeout);
+  };
+
+  ExpectRealtimeDeath(Func, "kevent64");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif // SANITIZER_INTERCEPT_KQUEUE
+
+TEST(TestRtsanInterceptors, MkfifoDiesWhenRealtime) {
+  auto Func = []() { mkfifo("/tmp/rtsan_test_fifo", 0); };
+  ExpectRealtimeDeath(Func, "mkfifo");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST(TestRtsanInterceptors, PipeDiesWhenRealtime) {
+  int fds[2];
+  auto Func = [&fds]() { pipe(fds); };
+  ExpectRealtimeDeath(Func, "pipe");
+  ExpectNonRealtimeSurvival(Func);
+}
+
 #endif // SANITIZER_POSIX
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index b8627f8557afe..99fa737adfaf2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -2289,6 +2289,61 @@ INTERCEPTOR(int, pthread_getcpuclockid, uptr thread,
 #define INIT_CLOCK_GETCPUCLOCKID
 #endif
 
+#if SANITIZER_INTERCEPT_TIMER_CREATE
+INTERCEPTOR(int, timer_create, __sanitizer_clockid_t clockid, void *sevp,
+            __sanitizer_timer_t *timer) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, timer_create, clockid, sevp, timer);
+  const int res = REAL(timer_create)(clockid, sevp, timer);
+  // Report *timer as written only when the real call succeeded.
+  if (res == 0 && timer != nullptr)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, timer, sizeof(*timer));
+  return res;
+}
+
+INTERCEPTOR(int, timer_delete, __sanitizer_timer_t timer) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, timer_delete, timer);
+  // No memory ranges are annotated for this call; just forward it.
+  return REAL(timer_delete)(timer);
+}
+
+INTERCEPTOR(int, timer_gettime, __sanitizer_timer_t timer,
+            struct __sanitizer_itimerspec *curr_value) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, timer_gettime, timer, curr_value);
+  const int res = REAL(timer_gettime)(timer, curr_value);
+  // Report *curr_value as written only when the real call succeeded.
+  if (res == 0 && curr_value != nullptr)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, curr_value, sizeof(*curr_value));
+  return res;
+}
+
+INTERCEPTOR(int, timer_settime, __sanitizer_timer_t timer, int flags,
+            const struct __sanitizer_itimerspec *new_value,
+            struct __sanitizer_itimerspec *old_value) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, timer_settime, timer, flags, new_value,
+                           old_value);
+  // new_value is an input: check it before the real call consumes it, and
+  // regardless of whether that call then succeeds.
+  if (new_value)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, new_value, sizeof *new_value);
+  int res = REAL(timer_settime)(timer, flags, new_value, old_value);
+  if (!res && old_value)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, old_value, sizeof *old_value);
+  return res;
+}
+
+#  define INIT_TIMER_CREATE                                                \
+    COMMON_INTERCEPT_FUNCTION_GLIBC_VER_MIN(timer_create, "GLIBC_2.3.3");  \
+    COMMON_INTERCEPT_FUNCTION_GLIBC_VER_MIN(timer_delete, "GLIBC_2.3.3");  \
+    COMMON_INTERCEPT_FUNCTION_GLIBC_VER_MIN(timer_gettime, "GLIBC_2.3.3"); \
+    COMMON_INTERCEPT_FUNCTION_GLIBC_VER_MIN(timer_settime, "GLIBC_2.3.3");
+#else
+#  define INIT_TIMER_CREATE
+#endif
+
 #if SANITIZER_INTERCEPT_GETITIMER
 INTERCEPTOR(int, getitimer, int which, void *curr_value) {
   void *ctx;
@@ -10266,6 +10321,7 @@ static void InitializeCommonInterceptors() {
   INIT_SETPWENT;
   INIT_CLOCK_GETTIME;
   INIT_CLOCK_GETCPUCLOCKID;
+  INIT_TIMER_CREATE;
   INIT_GETITIMER;
   INIT_TIME;
   INIT_GLOB;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 3fd6b595ef197..1f78b1af8e2c6 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -210,6 +210,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_PREAD64 (SI_GLIBC || SI_SOLARIS32)
 #define SANITIZER_INTERCEPT_PWRITE64 (SI_GLIBC || SI_SOLARIS32)
 
+#define SANITIZER_INTERCEPT_LSEEK64 (SI_GLIBC || SI_SOLARIS32)
+
 #define SANITIZER_INTERCEPT_READV SI_POSIX
 #define SANITIZER_INTERCEPT_WRITEV SI_POSIX
 
@@ -256,6 +258,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
   (SI_FREEBSD || SI_NETBSD || SI_LINUX || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID \
   (SI_LINUX || SI_FREEBSD || SI_NETBSD)
+// TODO: This should be SI_POSIX, adding glibc first until I have time
+// to verify all timer_t typedefs on other platforms.
+#define SANITIZER_INTERCEPT_TIMER_CREATE SI_GLIBC
 #define SANITIZER_INTERCEPT_GETITIMER SI_POSIX
 #define SANITIZER_INTERCEPT_TIME SI_POSIX
 #define SANITIZER_INTERCEPT_GLOB (SI_GLIBC || SI_SOLARIS)
@@ -339,6 +344,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_GETGROUPS SI_POSIX
 #define SANITIZER_INTERCEPT_POLL SI_POSIX
 #define SANITIZER_INTERCEPT_PPOLL SI_LINUX_NOT_ANDROID || SI_SOLARIS
+#define SANITIZER_INTERCEPT_EPOLL (SI_LINUX)
+#define SANITIZER_INTERCEPT_KQUEUE (SI_FREEBSD || SI_NETBSD || SI_MAC)
 #define SANITIZER_INTERCEPT_WORDEXP                                          \
   (SI_FREEBSD || SI_NETBSD || (SI_MAC && !SI_IOS) || SI_LINUX_NOT_ANDROID || \
    SI_SOLARIS)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index e8c81aa8e2816..7d98f8e9a9d80 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -389,6 +389,16 @@ typedef long __sanitizer_time_t;
 
 typedef long __sanitizer_suseconds_t;
 
+struct __sanitizer_timespec {
+  __sanitizer_time_t tv_sec; /* seconds */
+  u64 tv_nsec;               /* nanoseconds; NOTE(review): POSIX has long */
+};
+
+struct __sanitizer_itimerspec {
+  struct __sanitizer_timespec it_interval; /* timer period */
+  struct __sanitizer_timespec it_value;    /* timer expiration */
+};
+
 struct __sanitizer_timeval {
   __sanitizer_time_t tv_sec;
   __sanitizer_suseconds_t tv_usec;
@@ -1517,6 +1527,10 @@ extern const int si_SEGV_ACCERR;
 
 #define SIGACTION_SYMNAME sigaction
 
+#  if SANITIZER_LINUX
+typedef void *__sanitizer_timer_t;
+#  endif
+
 #endif  // SANITIZER_LINUX || SANITIZER_APPLE
 
 #endif
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
new file mode 100644
index 0000000000000..ff05a69a5e7d4
--- /dev/null
+++ b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
@@ -0,0 +1,29 @@
+// REQUIRES: target={{.*(darwin|aix).*}}
+
+// RUN: rm -f %t.profraw
+// RUN: %clangxx_pgogen_cont -lpthread %s -o %t.exe -mllvm -disable-vp -fprofile-update=atomic
+// RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe
+// RUN: llvm-profdata show --counts --function=accum  %t.profraw | FileCheck %s
+// CHECK:    Block counts: [100000, 4]
+
+#include <thread>
+
+int x = 0;
+void accum(int n) {
+  for (int i = 0; i < n; i++)
+    x += i; // don't care about accuracy, no need for atomic.
+}
+
+int main() {
+  int init_value = 10000;
+  auto t1 = std::thread(accum, 1*init_value);
+  auto t2 = std::thread(accum, 2*init_value);
+  auto t3 = std::thread(accum, 3*init_value);
+  auto t4 = std::thread(accum, 4*init_value);
+
+  t1.join();
+  t2.join();
+  t3.join();
+  t4.join();
+  return !x;
+}
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
index 35b0cd0b05d1f..54346487a5c79 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
@@ -8,9 +8,9 @@
 // Create two DSOs and a driver program that uses them.
 // RUN: echo "void dso1(void) {}" > dso1.c
 // RUN: echo "void dso2(void) {}" > dso2.c
-// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso1.dylib dso1.c -mllvm -instrprof-atomic-counter-update-all=1
-// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso2.dylib dso2.c -mllvm -instrprof-atomic-counter-update-all=1
-// RUN: %clang_pgogen_cont -o main.exe %s %t.dir/dso1.dylib %t.dir/dso2.dylib -mllvm -instrprof-atomic-counter-update-all=1
+// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso1.dylib dso1.c -fprofile-update=atomic
+// RUN: %clang_pgogen_cont %shared_lib_flag -o %t.dir/dso2.dylib dso2.c -fprofile-update=atomic
+// RUN: %clang_pgogen_cont -o main.exe %s %t.dir/dso1.dylib %t.dir/dso2.dylib -fprofile-update=atomic
 //
 // === Round 1 ===
 // Test merging+continuous mode without any file contention.
diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index 7a8877b9f4e50..72a389eaf0dfb 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -138,6 +138,14 @@ def exclude_unsupported_files_for_aix(dirname):
 config.substitutions.append(
     ("%clangxx_pgogen=", build_invocation(clang_cxxflags) + " -fprofile-generate=")
 )
+config.substitutions.append(
+    (
+        "%clangxx_pgogen_cont ",
+        build_invocation(clang_cxxflags)
+        + " -fprofile-generate "
+        + ("-mllvm -runtime-counter-relocation " if runtime_reloc else ""),
+    )
+)
 
 config.substitutions.append(
     ("%clang_cspgogen ", build_invocation(clang_cflags) + " -fcs-profile-generate ")
diff --git a/compiler-rt/test/rtsan/report_error_summary.cpp b/compiler-rt/test/rtsan/report_error_summary.cpp
new file mode 100644
index 0000000000000..9da7f217f61bf
--- /dev/null
+++ b/compiler-rt/test/rtsan/report_error_summary.cpp
@@ -0,0 +1,32 @@
+// RUN: %clangxx -fsanitize=realtime %s -o %t
+// RUN: %env_rtsan_opts="halt_on_error=false" %run %t 2>&1 | FileCheck %s
+
+// RUN: %clangxx -DTEST_CUSTOM_HANDLER=1 -fsanitize=realtime %s -o %t
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-CUSTOM-HANDLER
+
+// UNSUPPORTED: ios
+
+// Intent: Make sure we support ReportErrorSummary, including custom handlers
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef TEST_CUSTOM_HANDLER
+extern "C" void __sanitizer_report_error_summary(const char *error_summary) {
+  fprintf(stderr, "%s %s\n", "In custom handler! ", error_summary);
+}
+#endif
+
+int blocking_call() [[clang::blocking]] { return 0; }
+
+int main() [[clang::nonblocking]] {
+  void *ptr = malloc(2);
+  blocking_call();
+
+  printf("ptr: %p\n", ptr); // ensure we don't optimize out the malloc
+}
+
+// CHECK: SUMMARY: RealtimeSanitizer: unsafe-library-call
+// CHECK: SUMMARY: RealtimeSanitizer: blocking-call
+
+// CHECK-CUSTOM-HANDLER: In custom handler! SUMMARY: RealtimeSanitizer: unsafe-library-call
diff --git a/flang/CODE_OWNERS.TXT b/flang/Maintainers.txt
similarity index 100%
rename from flang/CODE_OWNERS.TXT
rename to flang/Maintainers.txt
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index dc68f160f5d92..fe7fee97bc78e 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -475,9 +475,9 @@ struct NodeVisitor {
   READ_FEATURE(OmpDoacross::Source)
   READ_FEATURE(OmpDoacrossClause)
   READ_FEATURE(OmpDependenceType)
-  READ_FEATURE(OmpDependenceType::Type)
+  READ_FEATURE(OmpDependenceType::Value)
   READ_FEATURE(OmpTaskDependenceType)
-  READ_FEATURE(OmpTaskDependenceType::Type)
+  READ_FEATURE(OmpTaskDependenceType::Value)
   READ_FEATURE(OmpIteration)
   READ_FEATURE(OmpIterationOffset)
   READ_FEATURE(OmpIterationVector)
@@ -495,7 +495,7 @@ struct NodeVisitor {
   READ_FEATURE(OmpLinearClause::WithModifier)
   READ_FEATURE(OmpLinearClause::WithoutModifier)
   READ_FEATURE(OmpLinearModifier)
-  READ_FEATURE(OmpLinearModifier::Type)
+  READ_FEATURE(OmpLinearModifier::Value)
   READ_FEATURE(OmpLoopDirective)
   READ_FEATURE(OmpMapClause)
   READ_FEATURE(OmpMapClause::TypeModifier)
@@ -515,7 +515,7 @@ struct NodeVisitor {
   READ_FEATURE(OmpReductionCombiner)
   READ_FEATURE(OmpReductionCombiner::FunctionCombiner)
   READ_FEATURE(OmpReductionInitializerClause)
-  READ_FEATURE(OmpReductionOperator)
+  READ_FEATURE(OmpReductionIdentifier)
   READ_FEATURE(OmpAllocateClause)
   READ_FEATURE(OmpAllocateClause::AllocateModifier)
   READ_FEATURE(OmpAllocateClause::AllocateModifier::Allocator)
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
index d28ed0534d600..c184fdafb5c33 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
@@ -218,11 +218,11 @@ void OpenMPCounterVisitor::Post(const OmpScheduleModifierType::ModType &c) {
   clauseDetails +=
       "modifier=" + std::string{OmpScheduleModifierType::EnumToString(c)} + ";";
 }
-void OpenMPCounterVisitor::Post(const OmpLinearModifier::Type &c) {
+void OpenMPCounterVisitor::Post(const OmpLinearModifier::Value &c) {
   clauseDetails +=
       "modifier=" + std::string{OmpLinearModifier::EnumToString(c)} + ";";
 }
-void OpenMPCounterVisitor::Post(const OmpTaskDependenceType::Type &c) {
+void OpenMPCounterVisitor::Post(const OmpTaskDependenceType::Value &c) {
   clauseDetails +=
       "type=" + std::string{OmpTaskDependenceType::EnumToString(c)} + ";";
 }
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
index 68c52db46e2f0..6c2d194a88e69 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
@@ -72,8 +72,8 @@ struct OpenMPCounterVisitor {
   void Post(const OmpDefaultmapClause::VariableCategory &c);
   void Post(const OmpDeviceTypeClause::Type &c);
   void Post(const OmpScheduleModifierType::ModType &c);
-  void Post(const OmpLinearModifier::Type &c);
-  void Post(const OmpTaskDependenceType::Type &c);
+  void Post(const OmpLinearModifier::Value &c);
+  void Post(const OmpTaskDependenceType::Value &c);
   void Post(const OmpMapClause::Type &c);
   void Post(const OmpScheduleClause::ScheduleType &c);
   void Post(const OmpIfClause::DirectiveNameModifier &c);
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index a8a6eb922a045..6261a4eec4a55 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1416,6 +1416,8 @@ common::IgnoreTKRSet GetIgnoreTKR(const Symbol &);
 
 std::optional<int> GetDummyArgumentNumber(const Symbol *);
 
+const Symbol *FindAncestorModuleProcedure(const Symbol *symInSubmodule);
+
 } // namespace Fortran::semantics
 
 #endif // FORTRAN_EVALUATE_TOOLS_H_
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 6b41025eea078..f073f494b3fb2 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -357,8 +357,8 @@ hlfir::ElementalOp genElementalOp(
 
 /// Structure to describe a loop nest.
 struct LoopNest {
-  fir::DoLoopOp outerLoop;
-  fir::DoLoopOp innerLoop;
+  mlir::Operation *outerOp = nullptr;
+  mlir::Block *body = nullptr;
   llvm::SmallVector<mlir::Value> oneBasedIndices;
 };
 
@@ -366,11 +366,13 @@ struct LoopNest {
 /// \p isUnordered specifies whether the loops in the loop nest
 /// are unordered.
 LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
-                     mlir::ValueRange extents, bool isUnordered = false);
+                     mlir::ValueRange extents, bool isUnordered = false,
+                     bool emitWorkshareLoop = false);
 inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
-                            mlir::Value shape, bool isUnordered = false) {
+                            mlir::Value shape, bool isUnordered = false,
+                            bool emitWorkshareLoop = false) {
   return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape),
-                     isUnordered);
+                     isUnordered, emitWorkshareLoop);
 }
 
 /// Inline the body of an hlfir.elemental at the current insertion point
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h
index 403d79667bf44..feb395f1a12db 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.h
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.h
@@ -25,6 +25,11 @@ namespace flangomp {
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 
+/// Implements the logic specified in the 2.8.3 workshare Construct section of
+/// the OpenMP standard which specifies what statements or constructs shall be
+/// divided into units of work.
+bool shouldUseWorkshareLowering(mlir::Operation *op);
+
 } // namespace flangomp
 
 #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index c070bc22ff20c..37977334c1e9e 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -50,4 +50,9 @@ def FunctionFilteringPass : Pass<"omp-function-filtering"> {
   ];
 }
 
+// Needs to be scheduled on Module as we create functions in it
+def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> {
+  let summary = "Lower workshare construct";
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 3b54ac3883858..55fafc2e6b36f 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -123,7 +123,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 void createHLFIRToFIRPassPipeline(
-    mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel);
+    mlir::PassManager &pm, bool enableOpenMP,
+    llvm::OptimizationLevel optLevel = defaultOptLevel);
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
 /// prior to FIR lowering.
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 5886e384b986b..63fddc424182b 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -477,7 +477,7 @@ class ParseTreeDumper {
   NODE(parser, ObjectDecl)
   NODE(parser, OldParameterStmt)
   NODE(parser, OmpIteratorSpecifier)
-  NODE(parser, OmpIteratorModifier)
+  NODE(parser, OmpIterator)
   NODE(parser, OmpAffinityClause)
   NODE(parser, OmpAlignedClause)
   NODE(parser, OmpAtomic)
@@ -513,9 +513,9 @@ class ParseTreeDumper {
   NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior)
   NODE_ENUM(OmpDefaultmapClause, VariableCategory)
   NODE(parser, OmpDependenceType)
-  NODE_ENUM(OmpDependenceType, Type)
+  NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
-  NODE_ENUM(OmpTaskDependenceType, Type)
+  NODE_ENUM(OmpTaskDependenceType, Value)
   NODE(parser, OmpIterationOffset)
   NODE(parser, OmpIteration)
   NODE(parser, OmpIterationVector)
@@ -543,9 +543,10 @@ class ParseTreeDumper {
   NODE(OmpLinearClause, WithModifier)
   NODE(OmpLinearClause, WithoutModifier)
   NODE(parser, OmpLinearModifier)
-  NODE_ENUM(OmpLinearModifier, Type)
+  NODE_ENUM(OmpLinearModifier, Value)
   NODE(parser, OmpLoopDirective)
   NODE(parser, OmpMapClause)
+  NODE(parser, OmpMapperIdentifier)
   NODE_ENUM(OmpMapClause, TypeModifier)
   NODE_ENUM(OmpMapClause, Type)
   static std::string GetNodeName(const llvm::omp::Clause &x) {
@@ -573,7 +574,7 @@ class ParseTreeDumper {
   NODE(parser, OmpReductionCombiner)
   NODE(OmpReductionCombiner, FunctionCombiner)
   NODE(parser, OmpReductionInitializerClause)
-  NODE(parser, OmpReductionOperator)
+  NODE(parser, OmpReductionIdentifier)
   NODE(parser, OmpAllocateClause)
   NODE(OmpAllocateClause, AllocateModifier)
   NODE(OmpAllocateClause::AllocateModifier, Allocator)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 8f50809599a58..22b7f9acd1af5 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3440,13 +3440,33 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
+inline namespace modifier {
+// For uniformity, in all keyword modifiers the name of the type defined
+// by ENUM_CLASS is "Value", e.g.
+// struct Foo {
+//   ENUM_CLASS(Value, Keyword1, Keyword2);
+// };
+
+// Ref: [5.0:47-49], [5.1:49-51], [5.2:67-69]
+//
+// iterator-specifier ->
+//    [iterator-type] iterator-identifier
+//        = range-specification |                   // since 5.0
+//    [iterator-type ::] iterator-identifier
+//        = range-specification                     // since 5.2
+struct OmpIteratorSpecifier {
+  TUPLE_CLASS_BOILERPLATE(OmpIteratorSpecifier);
+  CharBlock source;
+  std::tuple<TypeDeclarationStmt, SubscriptTriplet> t;
+};
+
 // Ref: [4.5:169-170], [5.0:255-256], [5.1:288-289]
 //
 // dependence-type ->
-//    SINK | SOURCE |           // since 4.5
-//    IN | OUT | INOUT |        // since 4.5, until 5.1
-//    MUTEXINOUTSET | DEPOBJ |  // since 5.0, until 5.1
-//    INOUTSET                  // since 5.1, until 5.1
+//    SINK | SOURCE |                               // since 4.5
+//    IN | OUT | INOUT |                            // since 4.5, until 5.1
+//    MUTEXINOUTSET | DEPOBJ |                      // since 5.0, until 5.1
+//    INOUTSET                                      // since 5.1, until 5.1
 //
 // All of these, except SINK and SOURCE became task-dependence-type in 5.2.
 //
@@ -3457,45 +3477,59 @@ WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 // vector). This would accept the vector "i, j, k" (although interpreted
 // incorrectly), while flagging a syntax error for "i+1, j, k".
 struct OmpDependenceType {
-  ENUM_CLASS(Type, Sink, Source);
-  WRAPPER_CLASS_BOILERPLATE(OmpDependenceType, Type);
+  ENUM_CLASS(Value, Sink, Source);
+  WRAPPER_CLASS_BOILERPLATE(OmpDependenceType, Value);
 };
 
-// Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
+// Ref: [5.0:47-49], [5.1:49-51], [5.2:67-69]
 //
-// task-dependence-type -> // "dependence-type" in 5.1 and before
-//    IN | OUT | INOUT |        // since 4.5
-//    MUTEXINOUTSET | DEPOBJ |  // since 5.0
-//    INOUTSET                  // since 5.2
-struct OmpTaskDependenceType {
-  ENUM_CLASS(Type, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
-  WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Type);
+// iterator-modifier ->
+//    ITERATOR(iterator-specifier [, ...])          // since 5.0
+struct OmpIterator {
+  WRAPPER_CLASS_BOILERPLATE(OmpIterator, std::list<OmpIteratorSpecifier>);
 };
 
-// [5.0] 2.1.6 iterator-specifier -> type-declaration-stmt = subscript-triple
-//             iterator-modifier -> iterator-specifier-list
-struct OmpIteratorSpecifier {
-  TUPLE_CLASS_BOILERPLATE(OmpIteratorSpecifier);
-  CharBlock source;
-  std::tuple<TypeDeclarationStmt, SubscriptTriplet> t;
+// Ref: [4.5:207-210], [5.0:290-293], [5.1:323-325], [5.2:117-120]
+//
+// linear-modifier ->
+//    REF | UVAL | VAL                              // since 4.5
+struct OmpLinearModifier {
+  ENUM_CLASS(Value, Ref, Uval, Val);
+  WRAPPER_CLASS_BOILERPLATE(OmpLinearModifier, Value);
 };
 
-WRAPPER_CLASS(OmpIteratorModifier, std::list<OmpIteratorSpecifier>);
-
-// 2.15.3.6 reduction-identifier -> + | - | * | .AND. | .OR. | .EQV. | .NEQV. |
-//                         MAX | MIN | IAND | IOR | IEOR
-struct OmpReductionOperator {
-  UNION_CLASS_BOILERPLATE(OmpReductionOperator);
+// Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124]
+//
+// reduction-identifier ->
+//   base-language-identifier |                     // since 4.5
+//   - |                                            // since 4.5, until 5.2
+//   + | * | .AND. | .OR. | .EQV. | .NEQV. |        // since 4.5
+//   MIN | MAX | IAND | IOR | IEOR                  // since 4.5
+//
+struct OmpReductionIdentifier {
+  UNION_CLASS_BOILERPLATE(OmpReductionIdentifier);
   std::variant<DefinedOperator, ProcedureDesignator> u;
 };
 
+// Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
+//
+// task-dependence-type -> // "dependence-type" in 5.1 and before
+//    IN | OUT | INOUT |                            // since 4.5
+//    MUTEXINOUTSET | DEPOBJ |                      // since 5.0
+//    INOUTSET                                      // since 5.2
+struct OmpTaskDependenceType {
+  ENUM_CLASS(Value, In, Out, Inout, Inoutset, Mutexinoutset, Depobj)
+  WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Value);
+};
+} // namespace modifier
+
 // --- Clauses
 
 // OMP 5.0 2.10.1 affinity([aff-modifier:] locator-list)
 //                aff-modifier: interator-modifier
 struct OmpAffinityClause {
   TUPLE_CLASS_BOILERPLATE(OmpAffinityClause);
-  std::tuple<std::optional<OmpIteratorModifier>, OmpObjectList> t;
+  std::tuple<std::optional<OmpIterator>, OmpObjectList> t;
 };
 
 // 2.8.1 aligned-clause -> ALIGNED (variable-name-list[ : scalar-constant])
@@ -3566,7 +3600,7 @@ WRAPPER_CLASS(OmpIterationVector, std::list<OmpIteration>);
 // OmpDoacrossClause), so that the context in TYPE_CONTEXT_PARSER can be set
 // separately for OmpDependClause and OmpDoacrossClause.
 struct OmpDoacross {
-  OmpDependenceType::Type GetDepType() const;
+  OmpDependenceType::Value GetDepType() const;
 
   WRAPPER_CLASS(Sink, OmpIterationVector);
   EMPTY_CLASS(Source);
@@ -3586,10 +3620,9 @@ struct OmpDoacross {
 struct OmpDependClause {
   UNION_CLASS_BOILERPLATE(OmpDependClause);
   struct TaskDep {
-    OmpTaskDependenceType::Type GetTaskDepType() const;
+    OmpTaskDependenceType::Value GetTaskDepType() const;
     TUPLE_CLASS_BOILERPLATE(TaskDep);
-    std::tuple<std::optional<OmpIteratorModifier>, OmpTaskDependenceType,
-        OmpObjectList>
+    std::tuple<std::optional<OmpIterator>, OmpTaskDependenceType, OmpObjectList>
         t;
   };
   std::variant<TaskDep, OmpDoacross> u;
@@ -3632,7 +3665,7 @@ struct OmpFromClause {
   // As in the case of MAP, modifiers are parsed as lists, even if they
   // are unique. These restrictions will be checked in semantic checks.
   std::tuple<std::optional<std::list<Expectation>>,
-      std::optional<std::list<OmpIteratorModifier>>, OmpObjectList,
+      std::optional<std::list<OmpIterator>>, OmpObjectList,
       bool> // were the modifiers comma-separated?
       t;
 };
@@ -3661,7 +3694,7 @@ struct OmpDetachClause {
 //                                         variable-name-list)
 struct OmpInReductionClause {
   TUPLE_CLASS_BOILERPLATE(OmpInReductionClause);
-  std::tuple<OmpReductionOperator, OmpObjectList> t;
+  std::tuple<OmpReductionIdentifier, OmpObjectList> t;
 };
 
 // OMP 5.0 2.19.4.5 lastprivate-clause ->
@@ -3673,12 +3706,6 @@ struct OmpLastprivateClause {
   std::tuple<std::optional<LastprivateModifier>, OmpObjectList> t;
 };
 
-// 2.15.3.7 linear-modifier -> REF | VAL | UVAL
-struct OmpLinearModifier {
-  ENUM_CLASS(Type, Ref, Val, Uval)
-  WRAPPER_CLASS_BOILERPLATE(OmpLinearModifier, Type);
-};
-
 // 2.15.3.7 linear-clause -> LINEAR (linear-list[ : linear-step])
 //          linear-list -> list | linear-modifier(list)
 struct OmpLinearClause {
@@ -3703,8 +3730,11 @@ struct OmpLinearClause {
   std::variant<WithModifier, WithoutModifier> u;
 };
 
+WRAPPER_CLASS(OmpMapperIdentifier, std::optional<Name>);
+
 // 2.15.5.1 map ->
-//    MAP ([[map-type-modifier-list [,]] [iterator-modifier [,]] map-type : ]
+//    MAP ([MAPPER(mapper-identifier)] [[map-type-modifier-list [,]]
+//    [iterator-modifier [,]] map-type : ]
 //         variable-name-list)
 // map-type-modifier-list -> map-type-modifier [,] [...]
 // map-type-modifier -> ALWAYS | CLOSE | PRESENT | OMPX_HOLD
@@ -3718,8 +3748,9 @@ struct OmpMapClause {
   // The checks for satisfying those constraints are deferred to semantics.
   // In OpenMP 5.2 the non-comma syntax has been deprecated: keep the
   // information about separator presence to emit a diagnostic if needed.
-  std::tuple<std::optional<std::list<TypeModifier>>,
-      std::optional<std::list<OmpIteratorModifier>>, // unique
+  std::tuple<OmpMapperIdentifier, // Mapper name
+      std::optional<std::list<TypeModifier>>,
+      std::optional<std::list<OmpIterator>>, // unique
       std::optional<std::list<Type>>, // unique
       OmpObjectList,
       bool> // were the modifiers comma-separated?
@@ -3749,7 +3780,7 @@ struct OmpProcBindClause {
 struct OmpReductionClause {
   TUPLE_CLASS_BOILERPLATE(OmpReductionClause);
   ENUM_CLASS(ReductionModifier, Inscan, Task, Default)
-  std::tuple<std::optional<ReductionModifier>, OmpReductionOperator,
+  std::tuple<std::optional<ReductionModifier>, OmpReductionIdentifier,
       OmpObjectList>
       t;
 };
@@ -3794,7 +3825,7 @@ struct OmpToClause {
   // As in the case of MAP, modifiers are parsed as lists, even if they
   // are unique. These restrictions will be checked in semantic checks.
   std::tuple<std::optional<std::list<Expectation>>,
-      std::optional<std::list<OmpIteratorModifier>>, OmpObjectList,
+      std::optional<std::list<OmpIterator>>, OmpObjectList,
       bool> // were the modifiers comma-separated?
       t;
 };
@@ -3942,7 +3973,7 @@ WRAPPER_CLASS(OmpReductionInitializerClause, Expr);
 struct OpenMPDeclareReductionConstruct {
   TUPLE_CLASS_BOILERPLATE(OpenMPDeclareReductionConstruct);
   CharBlock source;
-  std::tuple<Verbatim, OmpReductionOperator, std::list<DeclarationTypeSpec>,
+  std::tuple<Verbatim, OmpReductionIdentifier, std::list<DeclarationTypeSpec>,
       OmpReductionCombiner, std::optional<OmpReductionInitializerClause>>
       t;
 };
diff --git a/flang/include/flang/Semantics/openmp-modifiers.h b/flang/include/flang/Semantics/openmp-modifiers.h
new file mode 100644
index 0000000000000..65d28f71fbc72
--- /dev/null
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -0,0 +1,391 @@
+//===-- include/flang/Semantics/openmp-modifiers.h --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+#define FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
+
+#include "flang/Common/enum-set.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/semantics.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include <cassert>
+#include <map>
+#include <optional>
+#include <variant>
+
+namespace Fortran::semantics {
+
+// Ref: [5.2:58]
+//
+// Syntactic properties for Clauses, Arguments and Modifiers
+//
+// Inverse properties:
+//   not Required  -> Optional
+//   not Unique    -> Repeatable
+//   not Exclusive -> Compatible
+//   not Ultimate  -> Free
+//
+// Clause defaults:   Optional, Repeatable, Compatible, Free
+// Argument defaults: Required,     Unique, Compatible, Free
+// Modifier defaults: Optional,     Unique, Compatible, Free
+//
+// ---
+// Each modifier is used as either pre-modifier (i.e. modifier: item),
+// or post-modifier (i.e. item: modifier). The default is pre-.
+// Add an additional property that reflects the type of modifier.
+
+ENUM_CLASS(OmpProperty, Required, Unique, Exclusive, Ultimate, Post)
+using OmpProperties = common::EnumSet<OmpProperty, OmpProperty_enumSize>;
+using OmpClauses =
+    common::EnumSet<llvm::omp::Clause, llvm::omp::Clause_enumSize>;
+
+struct OmpModifierDescriptor {
+  const OmpProperties &props(unsigned version) const;
+  const OmpClauses &clauses(unsigned version) const;
+
+  // Modifier name for use in diagnostic messages.
+  const llvm::StringRef name;
+  // Version-dependent properties of the modifier.
+  const std::map<unsigned, OmpProperties> props_;
+  // Version-dependent set of clauses to which the modifier can apply.
+  const std::map<unsigned, OmpClauses> clauses_;
+};
+
+template <typename SpecificTy> const OmpModifierDescriptor &OmpGetDescriptor();
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpDependenceType>();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpIterator>();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpLinearModifier>();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpReductionIdentifier>();
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpTaskDependenceType>();
+
+// Explanation of terminology:
+//
+// A typical clause with modifier[s] looks like this (with parts that are
+// not relevant here removed):
+//   struct OmpSomeClause {
+//     struct Modifier {
+//       using Variant = std::variant<Specific1, Specific2...>;
+//       Variant u;
+//     };
+//     std::tuple<std::optional<std::list<Modifier>>, ...> t;
+//   };
+//
+// The Specific1, etc. refer to parser classes that represent modifiers,
+// e.g. OmpIterator or OmpTaskDependenceType. The Variant type contains
+// all modifiers that are allowed for a given clause. The Modifier class
+// is there to wrap the variant into the form that the parse tree visitor
+// expects, i.e. with traits, member "u", etc.
+//
+// To avoid ambiguities with the word "modifier" (e.g. is it "any modifier",
+// or "this specific modifier"?), the following code uses different terms:
+//
+// - UnionTy:    refers to the nested "Modifier" class, i.e.
+//               "OmpSomeClause::Modifier" in the example above.
+// - SpecificTy: refers to any of the alternatives, i.e. "Specific1" or
+//               "Specific2".
+
+template <typename UnionTy>
+const OmpModifierDescriptor &OmpGetDescriptor(const UnionTy &modifier) {
+  return common::visit(
+      [](auto &&m) -> decltype(auto) {
+        using SpecificTy = llvm::remove_cvref_t<decltype(m)>;
+        return OmpGetDescriptor<SpecificTy>();
+      },
+      modifier.u);
+}
+
+/// Return the optional list of modifiers for a given `Omp[...]Clause`.
+/// Specifically, the parameter type `ClauseTy` is the class that OmpClause::v
+/// holds.
+template <typename ClauseTy>
+const std::optional<std::list<typename ClauseTy::Modifier>> &OmpGetModifiers(
+    const ClauseTy &clause) {
+  using UnionTy = typename ClauseTy::Modifier;
+  return std::get<std::optional<std::list<UnionTy>>>(clause.t);
+}
+
+namespace detail {
+/// Finds the first entry in the iterator range that holds the `SpecificTy`
+/// alternative, or the end iterator if it does not exist.
+/// Both `SpecificTy` and `UnionTy` must be given explicitly, since `UnionTy`
+/// only appears in a non-deduced context, e.g.
+///   const std::optional<std::list<X>> &modifiers = ...
+///   ... = findInRange<OmpIterator, X>(modifiers->begin(), modifiers->end());
+template <typename SpecificTy, typename UnionTy>
+typename std::list<UnionTy>::const_iterator findInRange(
+    typename std::list<UnionTy>::const_iterator begin,
+    typename std::list<UnionTy>::const_iterator end) {
+  for (auto it{begin}; it != end; ++it) {
+    if (std::holds_alternative<SpecificTy>(it->u)) {
+      return it;
+    }
+  }
+  return end;
+}
+} // namespace detail
+
+/// Finds the entry in the list that holds the `SpecificTy` alternative,
+/// and returns the pointer to that alternative. If such an entry does not
+/// exist, it returns nullptr.
+/// The list is assumed to contain at most one such item, with a check
+/// whether the condition is met.
+/// This function should only be called after the verification of modifier
+/// properties has been performed, since it will assert if multiple items
+/// are found.
+template <typename SpecificTy, typename UnionTy>
+const SpecificTy *OmpGetUniqueModifier(
+    const std::optional<std::list<UnionTy>> &modifiers) {
+  const SpecificTy *found{nullptr};
+  if (modifiers) {
+    auto end{modifiers->cend()};
+    // cbegin()/cend() give the const_iterators that findInRange requires.
+    auto at{detail::findInRange<SpecificTy, UnionTy>(modifiers->cbegin(), end)};
+    if (at != end) {
+      found = &std::get<SpecificTy>(at->u);
+#ifndef NDEBUG
+      auto another{
+          detail::findInRange<SpecificTy, UnionTy>(std::next(at), end)};
+      assert(another == end && "repeated modifier");
+#endif
+    }
+  }
+  return found;
+}
+
+namespace detail {
+template <typename T> constexpr const T *make_nullptr() {
+  return static_cast<const T *>(nullptr);
+}
+
+/// Helper function for verifying the Required property:
+/// For a specific SpecificTy, if SpecificTy has the Required property,
+/// check if the list has an item that holds SpecificTy as an alternative.
+/// If SpecificTy does not have the Required property, ignore it.
+template <typename SpecificTy, typename UnionTy>
+bool verifyIfRequired(const SpecificTy *,
+    const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx) {
+  unsigned version{semaCtx.langOptions().OpenMPVersion};
+  const OmpModifierDescriptor &desc{OmpGetDescriptor<SpecificTy>()};
+  if (!desc.props(version).test(OmpProperty::Required)) {
+    // If the modifier is not required, there is nothing to do.
+    return true;
+  }
+  bool present{modifiers.has_value()};
+  present = present && llvm::any_of(*modifiers, [](auto &&m) {
+    return std::holds_alternative<SpecificTy>(m.u);
+  });
+  if (!present) {
+    semaCtx.Say(
+        clauseSource, "A %s modifier is required"_err_en_US, desc.name.str());
+  }
+  return present;
+}
+
+/// Helper function for verifying the Required property:
+/// Visit all specific types in UnionTy, and verify the Required property
+/// for each one of them.
+template <typename UnionTy, size_t... Idxs>
+bool verifyRequiredPack(const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx,
+    std::integer_sequence<size_t, Idxs...>) {
+  using VariantTy = typename UnionTy::Variant;
+  return (verifyIfRequired(
+              make_nullptr<std::variant_alternative_t<Idxs, VariantTy>>(),
+              modifiers, clauseSource, semaCtx) &&
+      ...);
+}
+
+/// Verify the Required property for every alternative of UnionTy::Variant.
+/// Return true if the list is valid, or false otherwise.
+template <typename UnionTy>
+bool verifyRequired(const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx) {
+  using VariantTy = typename UnionTy::Variant;
+  return verifyRequiredPack(modifiers, clauseSource, semaCtx,
+      std::make_index_sequence<std::variant_size_v<VariantTy>>{});
+}
+
+/// Helper function to verify the Unique property: if SpecificTy is Unique
+/// (or Ultimate, which implies it), report an error when any element after
+/// `specific` also holds SpecificTy. Return true iff no duplicate is found.
+template <typename UnionTy, typename SpecificTy>
+bool verifyIfUnique(const SpecificTy *,
+    typename std::list<UnionTy>::const_iterator specific,
+    typename std::list<UnionTy>::const_iterator end,
+    SemanticsContext &semaCtx) {
+  // `specific` is the location of the modifier of type SpecificTy.
+  assert(specific != end && "`specific` must be a valid location");
+
+  unsigned version{semaCtx.langOptions().OpenMPVersion};
+  const OmpModifierDescriptor &desc{OmpGetDescriptor<SpecificTy>()};
+  // Ultimate implies Unique.
+  if (!desc.props(version).test(OmpProperty::Unique) &&
+      !desc.props(version).test(OmpProperty::Ultimate)) {
+    return true;
+  }
+  if (std::next(specific) != end) {
+    auto next{
+        detail::findInRange<SpecificTy, UnionTy>(std::next(specific), end)};
+    if (next != end) {
+      semaCtx.Say(next->source, "A %s cannot occur multiple times"_err_en_US,
+          desc.name.str());
+      return false; // A later duplicate makes the list invalid.
+    }
+  }
+  return true;
+}
+
+/// Verify the Unique property for every element of the list; all elements
+/// are checked. Return true if every per-element check passed.
+template <typename UnionTy>
+bool verifyUnique(const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx) {
+  if (!modifiers) {
+    return true;
+  }
+  bool result{true};
+  for (auto it{modifiers->cbegin()}, end{modifiers->cend()}; it != end; ++it) {
+    result = common::visit(
+                 [&](auto &&m) {
+                   return verifyIfUnique<UnionTy>(&m, it, end, semaCtx);
+                 },
+                 it->u) &&
+        result; // visit() runs first, so no element is skipped.
+  }
+  return result;
+}
+
+/// Verify the Ultimate property for the given list. Return true if the
+/// list is valid, or false otherwise.
+template <typename UnionTy>
+bool verifyUltimate(const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx) {
+  if (!modifiers || modifiers->size() <= 1) { // One item is first and last.
+    return true;
+  }
+  unsigned version{semaCtx.langOptions().OpenMPVersion};
+  bool result{true};
+  auto first{modifiers->cbegin()};
+  auto last{std::prev(modifiers->cend())};
+
+  // Any item that has the Ultimate property has to be at the back of the
+  // list if it is a pre-modifier (no Post property), or at the front if it
+  // is a post-modifier.
+  // Walk over the list, and if a given item has the Ultimate property but is
+  // not at the right position, mark it as an error.
+  for (auto it{first}, end{modifiers->cend()}; it != end; ++it) {
+    result =
+        common::visit(
+            [&](auto &&m) {
+              using SpecificTy = llvm::remove_cvref_t<decltype(m)>;
+              const OmpModifierDescriptor &desc{OmpGetDescriptor<SpecificTy>()};
+              auto &props{desc.props(version)};
+
+              if (props.test(OmpProperty::Ultimate)) {
+                bool isPre = !props.test(OmpProperty::Post);
+                if (it == (isPre ? last : first)) {
+                  // Skip, since this is the correct place for this modifier.
+                  return true;
+                }
+                llvm::StringRef where{isPre ? "last" : "first"};
+                semaCtx.Say(it->source,
+                    "The %s should be the %s modifier"_err_en_US,
+                    desc.name.str(), where.str());
+                return false;
+              }
+              return true;
+            },
+            it->u) &&
+        result;
+  }
+  return result;
+}
+
+/// Verify the Exclusive property for the given list, reporting only the
+/// first conflict found. Return true if the list is valid, false otherwise.
+template <typename UnionTy>
+bool verifyExclusive(const std::optional<std::list<UnionTy>> &modifiers,
+    parser::CharBlock clauseSource, SemanticsContext &semaCtx) {
+  if (!modifiers || modifiers->size() <= 1) {
+    return true;
+  }
+  unsigned version{semaCtx.langOptions().OpenMPVersion};
+  const UnionTy &front{modifiers->front()};
+  const OmpModifierDescriptor &frontDesc{OmpGetDescriptor(front)};
+
+  auto second{std::next(modifiers->cbegin())};
+  auto end{modifiers->end()};
+
+  auto emitErrorMessage{[&](const UnionTy &excl, const UnionTy &other) {
+    const OmpModifierDescriptor &descExcl{OmpGetDescriptor(excl)};
+    const OmpModifierDescriptor &descOther{OmpGetDescriptor(other)};
+    parser::MessageFormattedText txt(
+        "An exclusive %s cannot be specified together with a modifier of a different type"_err_en_US,
+        descExcl.name.str());
+    parser::Message message(excl.source, txt);
+    message.Attach(
+        other.source, "%s provided here"_en_US, descOther.name.str());
+    semaCtx.Say(std::move(message));
+  }};
+
+  if (frontDesc.props(version).test(OmpProperty::Exclusive)) {
+    // If the first item has the Exclusive property, then check if there is
+    // another item in the rest of the list with a different SpecificTy as
+    // the alternative, and mark it as an error. This allows multiple Exclusive
+    // items to coexist as long as they hold the same SpecificTy.
+    bool result{true};
+    size_t frontIndex{front.u.index()};
+    for (auto it{second}; it != end; ++it) {
+      if (it->u.index() != frontIndex) {
+        emitErrorMessage(front, *it);
+        result = false;
+        break;
+      }
+    }
+    return result;
+  } else {
+    // If the first item does not have the Exclusive property, then check
+    // if there is an item in the rest of the list that is Exclusive, and
+    // mark it as an error if so.
+    bool result{true};
+    for (auto it{second}; it != end; ++it) {
+      const OmpModifierDescriptor &desc{OmpGetDescriptor(*it)};
+      if (desc.props(version).test(OmpProperty::Exclusive)) {
+        emitErrorMessage(*it, front);
+        result = false;
+        break;
+      }
+    }
+    return result;
+  }
+}
+} // namespace detail
+
+template <typename ClauseTy>
+bool OmpVerifyModifiers(const ClauseTy &clause, parser::CharBlock clauseSource,
+    SemanticsContext &semaCtx) {
+  auto &modifiers{OmpGetModifiers(clause)}; // Checked by all four verifiers.
+  bool result{detail::verifyRequired(modifiers, clauseSource, semaCtx)};
+  result = detail::verifyUnique(modifiers, clauseSource, semaCtx) && result;
+  result = detail::verifyUltimate(modifiers, clauseSource, semaCtx) && result;
+  result = detail::verifyExclusive(modifiers, clauseSource, semaCtx) && result;
+  return result; // False if any of the four checks failed.
+}
+} // namespace Fortran::semantics
+
+#endif // FORTRAN_SEMANTICS_OPENMP_MODIFIERS_H_
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index df4b21ada058f..d936b739e5815 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -123,6 +123,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
       false; ///< Set no-signed-zeros-fp-math attribute for functions.
   bool UnsafeFPMath = false; ///< Set unsafe-fp-math attribute for functions.
   bool NSWOnLoopVarInc = false; ///< Add nsw flag to loop variable increments.
+  bool EnableOpenMP = false; ///< Enable OpenMP lowering.
 };
 
 struct OffloadModuleOpts {
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index 78cc63d0fde40..324d6b8dde73b 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -731,11 +731,16 @@ static std::optional<Procedure> CharacterizeProcedure(
               return std::optional<Procedure>{};
             }
           },
-          [&](const semantics::EntityDetails &) {
+          [&](const semantics::EntityDetails &x) {
             CheckForNested(symbol);
             return std::optional<Procedure>{};
           },
           [&](const semantics::SubprogramNameDetails &) {
+            if (const semantics::Symbol *
+                ancestor{FindAncestorModuleProcedure(&symbol)}) {
+              return CharacterizeProcedure(
+                  *ancestor, context, seenProcs, emitError);
+            }
             CheckForNested(symbol);
             return std::optional<Procedure>{};
           },
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 4d98220a7065c..15e3e9452894d 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1990,4 +1990,37 @@ std::optional<int> GetDummyArgumentNumber(const Symbol *symbol) {
   return std::nullopt;
 }
 
+// For a module-subprogram name symbol in a submodule, walk up the ancestor
+// scopes and return the procedure declared under that name, or nullptr.
+const Symbol *FindAncestorModuleProcedure(const Symbol *symInSubmodule) {
+  if (symInSubmodule && symInSubmodule->owner().IsSubmodule()) {
+    if (const auto *nameDetails{
+            symInSubmodule->detailsIf<semantics::SubprogramNameDetails>()};
+        nameDetails &&
+        nameDetails->kind() == semantics::SubprogramKind::Module) {
+      const Symbol *next{symInSubmodule->owner().symbol()};
+      while (const Symbol * submodSym{next}) {
+        next = nullptr; // Stop unless the lookup below queues a parent.
+        if (const auto *modDetails{
+                submodSym->detailsIf<semantics::ModuleDetails>()};
+            modDetails && modDetails->isSubmodule() && modDetails->scope()) {
+          if (const semantics::Scope & parent{modDetails->scope()->parent()};
+              parent.IsSubmodule() || parent.IsModule()) {
+            if (auto iter{parent.find(symInSubmodule->name())};
+                iter != parent.end()) {
+              const Symbol &proc{iter->second->GetUltimate()};
+              if (IsProcedure(proc)) {
+                return &proc;
+              }
+            } else if (parent.IsSubmodule()) {
+              next = parent.symbol(); // Name not found here: keep climbing.
+            }
+          }
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
 } // namespace Fortran::semantics
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index f2e460fc53a67..8c21fe18e67b4 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -715,7 +715,11 @@ void CodeGenAction::lowerHLFIRToFIR() {
   pm.enableVerifier(/*verifyPasses=*/true);
 
   // Create the pass pipeline
-  fir::createHLFIRToFIRPassPipeline(pm, level);
+  fir::createHLFIRToFIRPassPipeline(
+      pm,
+      ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP),
+      level);
   (void)mlir::applyPassManagerCLOptions(pm);
 
   if (!mlir::succeeded(pm.run(*mlirModule))) {
@@ -828,6 +832,10 @@ void CodeGenAction::generateLLVMIR() {
     config.VScaleMax = vsr->second;
   }
 
+  if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP))
+    config.EnableOpenMP = true;
+
   if (ci.getInvocation().getLoweringOpts().getNSWOnLoopVarInc())
     config.NSWOnLoopVarInc = true;
 
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 9f5b58590fb79..e84e7afbe82e0 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -2135,7 +2135,7 @@ class ElementalCallBuilder {
           hlfir::genLoopNest(loc, builder, shape, !mustBeOrdered);
       mlir::ValueRange oneBasedIndices = loopNest.oneBasedIndices;
       auto insPt = builder.saveInsertionPoint();
-      builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+      builder.setInsertionPointToStart(loopNest.body);
       callContext.stmtCtx.pushScope();
       for (auto &preparedActual : loweredActuals)
         if (preparedActual)
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index f897022ef9512..eddc742d4c095 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -10,6 +10,7 @@
 
 #include "flang/Common/idioms.h"
 #include "flang/Evaluate/expression.h"
+#include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/expression.h"
 #include "flang/Semantics/symbol.h"
@@ -264,7 +265,7 @@ makeIteratorSpecifiers(const parser::OmpIteratorSpecifier &inp,
   return specifiers;
 }
 
-Iterator makeIterator(const parser::OmpIteratorModifier &inp,
+Iterator makeIterator(const parser::OmpIterator &inp,
                       semantics::SemanticsContext &semaCtx) {
   Iterator iterator;
   for (auto &&spec : inp.v)
@@ -324,8 +325,9 @@ makeProcedureDesignator(const parser::ProcedureDesignator &inp,
       inp.u)};
 }
 
-ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp,
-                                        semantics::SemanticsContext &semaCtx) {
+ReductionOperator
+makeReductionOperator(const parser::OmpReductionIdentifier &inp,
+                      semantics::SemanticsContext &semaCtx) {
   return Fortran::common::visit(
       common::visitors{
           [&](const parser::DefinedOperator &s) {
@@ -340,9 +342,9 @@ ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp,
 
 clause::DependenceType makeDepType(const parser::OmpDependenceType &inp) {
   switch (inp.v) {
-  case parser::OmpDependenceType::Type::Sink:
+  case parser::OmpDependenceType::Value::Sink:
     return clause::DependenceType::Sink;
-  case parser::OmpDependenceType::Type::Source:
+  case parser::OmpDependenceType::Value::Source:
     return clause::DependenceType::Source;
   }
   llvm_unreachable("Unexpected dependence type");
@@ -350,17 +352,17 @@ clause::DependenceType makeDepType(const parser::OmpDependenceType &inp) {
 
 clause::DependenceType makeDepType(const parser::OmpTaskDependenceType &inp) {
   switch (inp.v) {
-  case parser::OmpTaskDependenceType::Type::Depobj:
+  case parser::OmpTaskDependenceType::Value::Depobj:
     return clause::DependenceType::Depobj;
-  case parser::OmpTaskDependenceType::Type::In:
+  case parser::OmpTaskDependenceType::Value::In:
     return clause::DependenceType::In;
-  case parser::OmpTaskDependenceType::Type::Inout:
+  case parser::OmpTaskDependenceType::Value::Inout:
     return clause::DependenceType::Inout;
-  case parser::OmpTaskDependenceType::Type::Inoutset:
+  case parser::OmpTaskDependenceType::Value::Inoutset:
     return clause::DependenceType::Inoutset;
-  case parser::OmpTaskDependenceType::Type::Mutexinoutset:
+  case parser::OmpTaskDependenceType::Value::Mutexinoutset:
     return clause::DependenceType::Mutexinoutset;
-  case parser::OmpTaskDependenceType::Type::Out:
+  case parser::OmpTaskDependenceType::Value::Out:
     return clause::DependenceType::Out;
   }
   llvm_unreachable("Unexpected task dependence type");
@@ -381,7 +383,7 @@ Absent make(const parser::OmpClause::Absent &inp,
 Affinity make(const parser::OmpClause::Affinity &inp,
               semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpAffinityClause
-  auto &t0 = std::get<std::optional<parser::OmpIteratorModifier>>(inp.v.t);
+  auto &t0 = std::get<std::optional<parser::OmpIterator>>(inp.v.t);
   auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
 
   auto &&maybeIter =
@@ -626,7 +628,7 @@ Depend make(const parser::OmpClause::Depend &inp,
   using Variant = decltype(Depend::u);
 
   auto visitTaskDep = [&](const wrapped::TaskDep &s) -> Variant {
-    auto &t0 = std::get<std::optional<parser::OmpIteratorModifier>>(s.t);
+    auto &t0 = std::get<std::optional<parser::OmpIterator>>(s.t);
     auto &t1 = std::get<parser::OmpTaskDependenceType>(s.t);
     auto &t2 = std::get<parser::OmpObjectList>(s.t);
 
@@ -769,8 +771,7 @@ From make(const parser::OmpClause::From &inp,
   );
 
   auto &t0 = std::get<std::optional<std::list<wrapped::Expectation>>>(inp.v.t);
-  auto &t1 =
-      std::get<std::optional<std::list<parser::OmpIteratorModifier>>>(inp.v.t);
+  auto &t1 = std::get<std::optional<std::list<parser::OmpIterator>>>(inp.v.t);
   auto &t2 = std::get<parser::OmpObjectList>(inp.v.t);
 
   assert((!t0 || t0->size() == 1) && "Only one expectation modifier allowed");
@@ -881,7 +882,7 @@ Init make(const parser::OmpClause::Init &inp,
 InReduction make(const parser::OmpClause::InReduction &inp,
                  semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpInReductionClause
-  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t0 = std::get<parser::OmpReductionIdentifier>(inp.v.t);
   auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
   return InReduction{
       {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)},
@@ -920,7 +921,7 @@ Linear make(const parser::OmpClause::Linear &inp,
   using wrapped = parser::OmpLinearClause;
 
   CLAUSET_ENUM_CONVERT( //
-      convert, parser::OmpLinearModifier::Type, Linear::LinearModifier,
+      convert, parser::OmpLinearModifier::Value, Linear::LinearModifier,
       // clang-format off
       MS(Ref,  Ref)
       MS(Val,  Val)
@@ -984,10 +985,13 @@ Map make(const parser::OmpClause::Map &inp,
   );
 
   auto &t0 = std::get<std::optional<std::list<wrapped::TypeModifier>>>(inp.v.t);
-  auto &t1 =
-      std::get<std::optional<std::list<parser::OmpIteratorModifier>>>(inp.v.t);
+  auto &t1 = std::get<std::optional<std::list<parser::OmpIterator>>>(inp.v.t);
   auto &t2 = std::get<std::optional<std::list<wrapped::Type>>>(inp.v.t);
   auto &t3 = std::get<parser::OmpObjectList>(inp.v.t);
+  auto &t4 = std::get<parser::OmpMapperIdentifier>(inp.v.t);
+
+  if (t4.v)
+    TODO_NOLOC("OmpMapClause(MAPPER(...)): user defined mapper not supported");
 
   // These should have been diagnosed already.
   assert((!t1 || t1->size() == 1) && "Only one iterator modifier is allowed");
@@ -1188,7 +1192,7 @@ Reduction make(const parser::OmpClause::Reduction &inp,
   auto &t0 =
       std::get<std::optional<parser::OmpReductionClause::ReductionModifier>>(
           inp.v.t);
-  auto &t1 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpReductionIdentifier>(inp.v.t);
   auto &t2 = std::get<parser::OmpObjectList>(inp.v.t);
   return Reduction{
       {/*ReductionModifier=*/t0
@@ -1315,7 +1319,7 @@ Permutation make(const parser::OmpClause::Permutation &inp,
 TaskReduction make(const parser::OmpClause::TaskReduction &inp,
                    semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpReductionClause
-  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t0 = std::get<parser::OmpReductionIdentifier>(inp.v.t);
   auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
   return TaskReduction{
       {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)},
@@ -1344,8 +1348,7 @@ To make(const parser::OmpClause::To &inp,
   );
 
   auto &t0 = std::get<std::optional<std::list<wrapped::Expectation>>>(inp.v.t);
-  auto &t1 =
-      std::get<std::optional<std::list<parser::OmpIteratorModifier>>>(inp.v.t);
+  auto &t1 = std::get<std::optional<std::list<parser::OmpIterator>>>(inp.v.t);
   auto &t2 = std::get<parser::OmpObjectList>(inp.v.t);
 
   assert((!t0 || t0->size() == 1) && "Only one expectation modifier allowed");
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 879a3e0ad7078..a2779213a1a15 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1372,6 +1372,15 @@ static void genTaskwaitClauses(lower::AbstractConverter &converter,
       loc, llvm::omp::Directive::OMPD_taskwait);
 }
 
+static void genWorkshareClauses(lower::AbstractConverter &converter,
+                                semantics::SemanticsContext &semaCtx,
+                                lower::StatementContext &stmtCtx,
+                                const List<Clause> &clauses, mlir::Location loc,
+                                mlir::omp::WorkshareOperands &clauseOps) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
+  cp.processNowait(clauseOps); // NOWAIT is the only clause handled here.
+}
+
 static void genTeamsClauses(lower::AbstractConverter &converter,
                             semantics::SemanticsContext &semaCtx,
                             lower::StatementContext &stmtCtx,
@@ -2033,6 +2042,24 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   return converter.getFirOpBuilder().create<mlir::omp::TaskyieldOp>(loc);
 }
 
+static mlir::omp::WorkshareOp
+genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
+               semantics::SemanticsContext &semaCtx,
+               lower::pft::Evaluation &eval, mlir::Location loc,
+               const ConstructQueue &queue,
+               ConstructQueue::const_iterator item) {
+  lower::StatementContext stmtCtx;
+  mlir::omp::WorkshareOperands clauseOps; // Filled in by genWorkshareClauses.
+  genWorkshareClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                      clauseOps);
+
+  return genOpWithBody<mlir::omp::WorkshareOp>(
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
+                        llvm::omp::Directive::OMPD_workshare)
+          .setClauses(&item->clauses),
+      queue, item, clauseOps);
+}
+
 static mlir::omp::TeamsOp
 genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
            semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
@@ -2631,10 +2658,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
                   llvm::omp::getOpenMPDirectiveName(dir) + ")");
   // case llvm::omp::Directive::OMPD_workdistribute:
   case llvm::omp::Directive::OMPD_workshare:
-    // FIXME: Workshare is not a commonly used OpenMP construct, an
-    // implementation for this feature will come later. For the codes
-    // that use this construct, add a single construct for now.
-    genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    genWorkshareOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   default:
     // Combined and composite constructs should have been split into a sequence
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 6b98ea3d0615b..736de2ee511be 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -374,7 +374,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
   // know this won't miss any opportuinties for clever elemental inlining
   hlfir::LoopNest nest = hlfir::genLoopNest(
       loc, builder, shapeShift.getExtents(), /*isUnordered=*/true);
-  builder.setInsertionPointToStart(nest.innerLoop.getBody());
+  builder.setInsertionPointToStart(nest.body);
   mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy());
   auto lhsEleAddr = builder.create<fir::ArrayCoorOp>(
       loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{},
@@ -388,7 +388,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
       builder, loc, redId, refTy, lhsEle, rhsEle);
   builder.create<fir::StoreOp>(loc, scalarReduction, lhsEleAddr);
 
-  builder.setInsertionPointAfter(nest.outerLoop);
+  builder.setInsertionPointAfter(nest.outerOp);
   builder.create<mlir::omp::YieldOp>(loc, lhsAddr);
 }
 
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 8d0ae2f195178..7425ccf7fc0e3 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -20,6 +20,7 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
 #include <optional>
 
 // Return explicit extents. If the base is a fir.box, this won't read it to
@@ -855,26 +856,50 @@ mlir::Value hlfir::inlineElementalOp(
 
 hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
                                    fir::FirOpBuilder &builder,
-                                   mlir::ValueRange extents, bool isUnordered) {
+                                   mlir::ValueRange extents, bool isUnordered,
+                                   bool emitWorkshareLoop) {
+  emitWorkshareLoop = emitWorkshareLoop && isUnordered; // Unordered nests only.
   hlfir::LoopNest loopNest;
   assert(!extents.empty() && "must have at least one extent");
-  auto insPt = builder.saveInsertionPoint();
+  mlir::OpBuilder::InsertionGuard guard(builder);
   loopNest.oneBasedIndices.assign(extents.size(), mlir::Value{});
   // Build loop nest from column to row.
   auto one = builder.create<mlir::arith::ConstantIndexOp>(loc, 1);
   mlir::Type indexType = builder.getIndexType();
-  unsigned dim = extents.size() - 1;
-  for (auto extent : llvm::reverse(extents)) {
-    auto ub = builder.createConvert(loc, indexType, extent);
-    loopNest.innerLoop =
-        builder.create<fir::DoLoopOp>(loc, one, ub, one, isUnordered);
-    builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
-    // Reverse the indices so they are in column-major order.
-    loopNest.oneBasedIndices[dim--] = loopNest.innerLoop.getInductionVar();
-    if (!loopNest.outerLoop)
-      loopNest.outerLoop = loopNest.innerLoop;
+  if (emitWorkshareLoop) {
+    auto wslw = builder.create<mlir::omp::WorkshareLoopWrapperOp>(loc);
+    loopNest.outerOp = wslw;
+    builder.createBlock(&wslw.getRegion());
+    mlir::omp::LoopNestOperands lnops; // NOTE(review): extents not index-cast.
+    lnops.loopInclusive = builder.getUnitAttr();
+    for (auto extent : llvm::reverse(extents)) {
+      lnops.loopLowerBounds.push_back(one);
+      lnops.loopUpperBounds.push_back(extent);
+      lnops.loopSteps.push_back(one);
+    }
+    auto lnOp = builder.create<mlir::omp::LoopNestOp>(loc, lnops);
+    mlir::Block *block = builder.createBlock(&lnOp.getRegion());
+    for (auto extent : llvm::reverse(extents))
+      block->addArgument(extent.getType(), extent.getLoc());
+    loopNest.body = block;
+    builder.create<mlir::omp::YieldOp>(loc);
+    for (unsigned dim = 0; dim < extents.size(); dim++)
+      loopNest.oneBasedIndices[extents.size() - dim - 1] =
+          lnOp.getRegion().front().getArgument(dim);
+  } else {
+    unsigned dim = extents.size() - 1;
+    for (auto extent : llvm::reverse(extents)) {
+      auto ub = builder.createConvert(loc, indexType, extent);
+      auto doLoop =
+          builder.create<fir::DoLoopOp>(loc, one, ub, one, isUnordered);
+      loopNest.body = doLoop.getBody();
+      builder.setInsertionPointToStart(loopNest.body);
+      // Reverse the indices so they are in column-major order.
+      loopNest.oneBasedIndices[dim--] = doLoop.getInductionVar();
+      if (!loopNest.outerOp)
+        loopNest.outerOp = doLoop;
+    }
   }
-  builder.restoreInsertionPoint(insPt);
   return loopNest;
 }
 
diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index 5f746bf80e9d5..9ec055b1aecab 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -1127,6 +1127,30 @@ struct TargetLoongArch64 : public GenericTarget<TargetLoongArch64> {
     }
     return marshal;
   }
+
+  CodeGenSpecifics::Marshalling
+  integerArgumentType(mlir::Location loc,
+                      mlir::IntegerType argTy) const override {
+    if (argTy.getWidth() == 32) {
+      // LA64 LP64D ABI requires unsigned 32 bit integers to be sign extended.
+      // Therefore, Flang also follows it if a function needs to be
+      // interoperable with C.
+      //
+      // Currently, it only adds `signext` attribute to the dummy arguments and
+      // return values in the function signatures, but it does not add the
+      // corresponding attribute to the actual arguments and return values in
+      // `fir.call` instruction. Thanks to LLVM's integration of all these
+      // attributes, the modification is still effective.
+      CodeGenSpecifics::Marshalling marshal;
+      AT::IntegerExtension intExt = AT::IntegerExtension::Sign;
+      marshal.emplace_back(argTy, AT{/*alignment=*/0, /*byval=*/false,
+                                     /*sret=*/false, /*append=*/false,
+                                     /*intExt=*/intExt});
+      return marshal;
+    }
+
+    return GenericTarget::integerArgumentType(loc, argTy); // Other widths.
+  }
 };
 } // namespace
 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index a70a6b388c4b1..07794828fce26 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
@@ -793,7 +794,7 @@ struct ElementalOpConversion
     hlfir::LoopNest loopNest =
         hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
     auto insPt = builder.saveInsertionPoint();
-    builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+    builder.setInsertionPointToStart(loopNest.body);
     auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
                                           loopNest.oneBasedIndices);
     hlfir::Entity elementValue(yield.getElementValue());
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 85dd517cb5791..424566462e8fe 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -464,7 +464,7 @@ void OrderedAssignmentRewriter::pre(hlfir::RegionAssignOp regionAssignOp) {
       // if the LHS is not).
       mlir::Value shape = hlfir::genShape(loc, builder, lhsEntity);
       elementalLoopNest = hlfir::genLoopNest(loc, builder, shape);
-      builder.setInsertionPointToStart(elementalLoopNest->innerLoop.getBody());
+      builder.setInsertionPointToStart(elementalLoopNest->body);
       lhsEntity = hlfir::getElementAt(loc, builder, lhsEntity,
                                       elementalLoopNest->oneBasedIndices);
       rhsEntity = hlfir::getElementAt(loc, builder, rhsEntity,
@@ -484,7 +484,7 @@ void OrderedAssignmentRewriter::pre(hlfir::RegionAssignOp regionAssignOp) {
     for (auto &cleanupConversion : argConversionCleanups)
       cleanupConversion();
     if (elementalLoopNest)
-      builder.setInsertionPointAfter(elementalLoopNest->outerLoop);
+      builder.setInsertionPointAfter(elementalLoopNest->outerOp);
   } else {
     // TODO: preserve allocatable assignment aspects for forall once
     // they are conveyed in hlfir.region_assign.
@@ -492,8 +492,7 @@ void OrderedAssignmentRewriter::pre(hlfir::RegionAssignOp regionAssignOp) {
   }
   generateCleanupIfAny(loweredLhs.elementalCleanup);
   if (loweredLhs.vectorSubscriptLoopNest)
-    builder.setInsertionPointAfter(
-        loweredLhs.vectorSubscriptLoopNest->outerLoop);
+    builder.setInsertionPointAfter(loweredLhs.vectorSubscriptLoopNest->outerOp);
   generateCleanupIfAny(oldRhsYield);
   generateCleanupIfAny(loweredLhs.nonElementalCleanup);
 }
@@ -518,8 +517,8 @@ void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) {
       hlfir::Entity savedMask{maybeSaved->first};
       mlir::Value shape = hlfir::genShape(loc, builder, savedMask);
       whereLoopNest = hlfir::genLoopNest(loc, builder, shape);
-      constructStack.push_back(whereLoopNest->outerLoop.getOperation());
-      builder.setInsertionPointToStart(whereLoopNest->innerLoop.getBody());
+      constructStack.push_back(whereLoopNest->outerOp);
+      builder.setInsertionPointToStart(whereLoopNest->body);
       mlir::Value cdt = hlfir::getElementAt(loc, builder, savedMask,
                                             whereLoopNest->oneBasedIndices);
       generateMaskIfOp(cdt);
@@ -527,7 +526,7 @@ void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) {
         // If this is the same run as the one that saved the value, the clean-up
         // was left-over to be done now.
         auto insertionPoint = builder.saveInsertionPoint();
-        builder.setInsertionPointAfter(whereLoopNest->outerLoop);
+        builder.setInsertionPointAfter(whereLoopNest->outerOp);
         generateCleanupIfAny(maybeSaved->second);
         builder.restoreInsertionPoint(insertionPoint);
       }
@@ -539,8 +538,8 @@ void OrderedAssignmentRewriter::pre(hlfir::WhereOp whereOp) {
     mask.generateNoneElementalPart(builder, mapper);
     mlir::Value shape = mask.generateShape(builder, mapper);
     whereLoopNest = hlfir::genLoopNest(loc, builder, shape);
-    constructStack.push_back(whereLoopNest->outerLoop.getOperation());
-    builder.setInsertionPointToStart(whereLoopNest->innerLoop.getBody());
+    constructStack.push_back(whereLoopNest->outerOp);
+    builder.setInsertionPointToStart(whereLoopNest->body);
     mlir::Value cdt = generateMaskedEntity(mask);
     generateMaskIfOp(cdt);
     return;
@@ -754,7 +753,7 @@ OrderedAssignmentRewriter::generateYieldedLHS(
       loweredLhs.vectorSubscriptLoopNest = hlfir::genLoopNest(
           loc, builder, loweredLhs.vectorSubscriptShape.value());
       builder.setInsertionPointToStart(
-          loweredLhs.vectorSubscriptLoopNest->innerLoop.getBody());
+          loweredLhs.vectorSubscriptLoopNest->body);
     }
     loweredLhs.lhs = temp->second.fetch(loc, builder);
     return loweredLhs;
@@ -771,8 +770,7 @@ OrderedAssignmentRewriter::generateYieldedLHS(
     loweredLhs.vectorSubscriptLoopNest =
         hlfir::genLoopNest(loc, builder, *loweredLhs.vectorSubscriptShape,
                            !elementalAddrLhs.isOrdered());
-    builder.setInsertionPointToStart(
-        loweredLhs.vectorSubscriptLoopNest->innerLoop.getBody());
+    builder.setInsertionPointToStart(loweredLhs.vectorSubscriptLoopNest->body);
     mapper.map(elementalAddrLhs.getIndices(),
                loweredLhs.vectorSubscriptLoopNest->oneBasedIndices);
     for (auto &op : elementalAddrLhs.getBody().front().without_terminator())
@@ -798,11 +796,11 @@ OrderedAssignmentRewriter::generateMaskedEntity(MaskedArrayExpr &maskedExpr) {
   if (!maskedExpr.noneElementalPartWasGenerated) {
     // Generate none elemental part before the where loops (but inside the
     // current forall loops if any).
-    builder.setInsertionPoint(whereLoopNest->outerLoop);
+    builder.setInsertionPoint(whereLoopNest->outerOp);
     maskedExpr.generateNoneElementalPart(builder, mapper);
   }
   // Generate the none elemental part cleanup after the where loops.
-  builder.setInsertionPointAfter(whereLoopNest->outerLoop);
+  builder.setInsertionPointAfter(whereLoopNest->outerOp);
   maskedExpr.generateNoneElementalCleanupIfAny(builder, mapper);
   // Generate the value of the current element for the masked expression
   // at the current insertion point (inside the where loops, and any fir.if
@@ -1242,7 +1240,7 @@ void OrderedAssignmentRewriter::saveLeftHandSide(
   LhsValueAndCleanUp loweredLhs = generateYieldedLHS(loc, region);
   fir::factory::TemporaryStorage *temp = nullptr;
   if (loweredLhs.vectorSubscriptLoopNest)
-    constructStack.push_back(loweredLhs.vectorSubscriptLoopNest->outerLoop);
+    constructStack.push_back(loweredLhs.vectorSubscriptLoopNest->outerOp);
   if (loweredLhs.vectorSubscriptLoopNest && !rhsIsArray(regionAssignOp)) {
     // Vector subscripted entity for which the shape must also be saved on top
     // of the element addresses (e.g. the shape may change in each forall
@@ -1265,7 +1263,7 @@ void OrderedAssignmentRewriter::saveLeftHandSide(
     // subscripted LHS.
     auto &vectorTmp = temp->cast<fir::factory::AnyVectorSubscriptStack>();
     auto insertionPoint = builder.saveInsertionPoint();
-    builder.setInsertionPoint(loweredLhs.vectorSubscriptLoopNest->outerLoop);
+    builder.setInsertionPoint(loweredLhs.vectorSubscriptLoopNest->outerOp);
     vectorTmp.pushShape(loc, builder, shape);
     builder.restoreInsertionPoint(insertionPoint);
   } else {
@@ -1290,8 +1288,7 @@ void OrderedAssignmentRewriter::saveLeftHandSide(
   generateCleanupIfAny(loweredLhs.elementalCleanup);
   if (loweredLhs.vectorSubscriptLoopNest) {
     constructStack.pop_back();
-    builder.setInsertionPointAfter(
-        loweredLhs.vectorSubscriptLoopNest->outerLoop);
+    builder.setInsertionPointAfter(loweredLhs.vectorSubscriptLoopNest->outerOp);
   }
 }
 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index d05a3258cf293..166649d955dab 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -483,7 +483,7 @@ llvm::LogicalResult ElementalAssignBufferization::matchAndRewrite(
   // hlfir.elemental region inside the inner loop
   hlfir::LoopNest loopNest =
       hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
-  builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+  builder.setInsertionPointToStart(loopNest.body);
   auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
                                         loopNest.oneBasedIndices);
   hlfir::Entity elementValue{yield.getElementValue()};
@@ -554,7 +554,7 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite(
       hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
       hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
-  builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+  builder.setInsertionPointToStart(loopNest.body);
   auto arrayElement =
       hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
   builder.create<hlfir::AssignOp>(loc, rhs, arrayElement);
@@ -652,7 +652,7 @@ llvm::LogicalResult VariableAssignBufferization::matchAndRewrite(
       hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
       hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
-  builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+  builder.setInsertionPointToStart(loopNest.body);
   auto rhsArrayElement =
       hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
   rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 035d0d5ca46c7..b1e0dbf6e707e 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FlangOpenMPTransforms
   MapsForPrivatizedSymbols.cpp
   MapInfoFinalization.cpp
   MarkDeclareTarget.cpp
+  LowerWorkshare.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
new file mode 100644
index 0000000000000..225c585a02d91
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -0,0 +1,527 @@
+//===- LowerWorkshare.cpp - Lower omp.workshare ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of omp.workshare to other omp constructs.
+//
+// This pass is tasked with parallelizing the loops nested in
+// workshare.loop_wrapper while both the Fortran to mlir lowering and the hlfir
+// to fir lowering pipelines are responsible for emitting the
+// workshare.loop_wrapper ops where appropriate according to the
+// `shouldUseWorkshareLowering` function.
+//
+//===----------------------------------------------------------------------===//
+
+#include <flang/Optimizer/Builder/FIRBuilder.h>
+#include <flang/Optimizer/Dialect/FIROps.h>
+#include <flang/Optimizer/Dialect/FIRType.h>
+#include <flang/Optimizer/HLFIR/HLFIROps.h>
+#include <flang/Optimizer/OpenMP/Passes.h>
+#include <llvm/ADT/BreadthFirstIterator.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVectorExtras.h>
+#include <llvm/ADT/iterator_range.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
+#include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h>
+#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/IRMapping.h>
+#include <mlir/IR/OpDefinition.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/IR/Visitors.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include <mlir/Support/LLVM.h>
+
+#include <variant>
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERWORKSHARE
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+#define DEBUG_TYPE "lower-workshare"
+
+using namespace mlir;
+
+namespace flangomp {
+
+// Checks for nesting pattern below as we need to avoid sharing the work of
+// statements which are nested in some constructs such as omp.critical or
+// another omp.parallel.
+//
+// omp.workshare { // `wsOp`
+//   ...
+//     omp.T { // `parent`
+//       ...
+//         `op`
+//
+template <typename T>
+static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
+  // Only an enclosing `T` that sits strictly inside `wsOp` counts.
+  if (T enclosing = op->getParentOfType<T>())
+    return wsOp->isProperAncestor(enclosing);
+  return false;
+}
+
+bool shouldUseWorkshareLowering(Operation *op) {
+  omp::WorkshareOp enclosingWs = op->getParentOfType<omp::WorkshareOp>();
+
+  // Without an enclosing omp.workshare there is no work to share.
+  if (!enclosingWs)
+    return false;
+
+  // Statements nested in omp.critical must not have their work shared.
+  const bool inCritical = isNestedIn<omp::CriticalOp>(enclosingWs, op);
+
+  // 2.8.3  workshare Construct
+  // For a parallel construct, the construct is a unit of work with respect to
+  // the workshare construct. The statements contained in the parallel construct
+  // are executed by a new thread team.
+  const bool inParallel = isNestedIn<omp::ParallelOp>(enclosingWs, op);
+
+  // 2.8.2  single Construct
+  // Binding The binding thread set for a single region is the current team. A
+  // single region binds to the innermost enclosing parallel region.
+  // Description Only one of the encountering threads will execute the
+  // structured block associated with the single construct.
+  const bool inSingle = isNestedIn<omp::SingleOp>(enclosingWs, op);
+
+  if (inCritical || inParallel || inSingle)
+    return false;
+
+  // Do not use workshare lowering until we support CFG in omp.workshare, i.e.
+  // only accept an omp.workshare whose region consists of a single block.
+  return enclosingWs.getRegion().hasOneBlock();
+}
+
+} // namespace flangomp
+
+namespace {
+
+struct SingleRegion { // Run of consecutive operations within one block.
+  Block::iterator begin, end; // Half-open: `end` is one past the last op.
+};
+
+/// Returns true if \p op is, or contains, an omp.workshare.loop_wrapper that
+/// binds to the omp.workshare currently being lowered.
+static bool mustParallelizeOp(Operation *op) {
+  return op
+      ->walk<WalkOrder::PreOrder>([&](Operation *nested) {
+        // Do not pick up a workshare.loop_wrapper that binds to a nested
+        // omp.workshare rather than to the one currently being handled:
+        //
+        // omp.parallel {
+        //   omp.workshare { // currently handling this
+        //     omp.parallel {
+        //       omp.workshare { // nested workshare
+        //         omp.workshare.loop_wrapper {}
+        //
+        // The walk is pre-order so that skipping at a nested omp.workshare
+        // actually prevents its body from being visited.
+        if (isa<omp::WorkshareOp>(nested))
+          return WalkResult::skip();
+        if (isa<omp::WorkshareLoopWrapperOp>(nested))
+          return WalkResult::interrupt();
+        return WalkResult::advance();
+      })
+      .wasInterrupted();
+}
+
+static bool isSafeToParallelize(Operation *op) {
+  // Declare ops are always accepted; anything else must have no memory effects.
+  return isa<hlfir::DeclareOp, fir::DeclareOp>(op) || isMemoryEffectFree(op);
+}
+
+/// Shallow-copy helper used instead of the frontend's full-fledged
+/// `createCopyFunc`: returns a private function (created on first use) that
+/// loads from its second `varType` argument and stores to its first.
+static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
+                                         fir::FirOpBuilder builder) {
+  mlir::ModuleOp module = builder.getModule();
+  auto rt = cast<fir::ReferenceType>(varType);
+  mlir::Type eleTy = rt.getEleTy();
+  std::string copyFuncName =
+      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");
+
+  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
+    return decl;
+  // No such function yet: create it with a (varType, varType) -> () signature.
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::OpBuilder modBuilder(module.getBodyRegion());
+  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
+  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
+  mlir::func::FuncOp funcOp =
+      modBuilder.create<mlir::func::FuncOp>(loc, copyFuncName, funcType);
+  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
+                      {loc, loc});
+  builder.setInsertionPointToStart(&funcOp.getRegion().back());
+
+  Value loaded = builder.create<fir::LoadOp>(loc, funcOp.getArgument(1));
+  builder.create<fir::StoreOp>(loc, loaded, funcOp.getArgument(0));
+
+  builder.create<mlir::func::ReturnOp>(loc);
+  return funcOp;
+}
+
+static bool isUserOutsideSR(Operation *user, Operation *parentOp,
+                            SingleRegion sr) {
+  // Climb to the ancestor of `user` that is nested directly in `parentOp`.
+  for (; user->getParentOp() != parentOp; user = user->getParentOp()) {}
+  return !(user->getBlock() == sr.begin->getBlock() &&
+           sr.begin->isBeforeInBlock(user) && user->isBeforeInBlock(&*sr.end));
+}
+
+/// Returns true if `v` has a user outside of `sr`, either directly or
+/// transitively through the results of safe to parallelize operations that
+/// sit directly in the block of `sr`.
+static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
+  Block *regionBlock = sr.begin->getBlock();
+  Operation *enclosingOp = regionBlock->getParentOp();
+
+  for (OpOperand &operand : v.getUses()) {
+    Operation *consumer = operand.getOwner();
+    // A direct user outside of `sr` settles the question.
+    if (isUserOutsideSR(consumer, enclosingOp, sr))
+      return true;
+
+    // `consumer` is inside `sr` from here on. It can only forward `v` further
+    // when it sits directly in the block of `sr` (results of nested users
+    // cannot be used outside of `sr`) and is safe to parallelize (other
+    // operations are checked for uses outside separately).
+    if (consumer->getBlock() != regionBlock || !isSafeToParallelize(consumer))
+      continue;
+
+    // Follow the transitive uses of `v` through the results of `consumer`.
+    if (llvm::any_of(consumer->getResults(), [&](Value res) {
+          return isTransitivelyUsedOutside(res, sr);
+        }))
+      return true;
+  }
+  return false;
+}
+
+/// Pure operations are cloned into both the parallel and the single blocks.
+/// This function erases the ones that ended up with no uses.
+static void cleanupBlock(Block *block) {
+  auto reversedOps = llvm::make_range(block->rbegin(), block->rend());
+  for (Operation &candidate : llvm::make_early_inc_range(reversedOps))
+    if (isOpTriviallyDead(&candidate))
+      candidate.erase();
+}
+
+/// Clones `sourceRegion` into `targetRegion`: ops containing a bound
+/// workshare.loop_wrapper are parallelized (wrappers become omp.wsloop), and
+/// the runs of operations between them are wrapped in omp.single as needed.
+static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
+                              IRMapping &rootMapping, Location loc,
+                              mlir::DominanceInfo &di) {
+  OpBuilder rootBuilder(sourceRegion.getContext());
+  ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
+  OpBuilder copyFuncBuilder(m.getBodyRegion());
+  fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);
+
+  auto mapReloadedValue =
+      [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
+    if (auto reloaded = rootMapping.lookupOrNull(v))
+      return nullptr;
+    Type ty = v.getType();
+    Value alloc = allocaBuilder.create<fir::AllocaOp>(loc, ty);
+    singleBuilder.create<fir::StoreOp>(loc, singleMapping.lookup(v), alloc);
+    Value reloaded = parallelBuilder.create<fir::LoadOp>(loc, ty, alloc);
+    rootMapping.map(v, reloaded);
+    return alloc;
+  };
+
+  auto moveToSingle =
+      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
+    IRMapping singleMapping = rootMapping;
+    SmallVector<Value> copyPrivate;
+    bool allParallelized = true;
+
+    for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
+      if (isSafeToParallelize(&op)) {
+        singleBuilder.clone(op, singleMapping);
+        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+              // Either we have already remapped it
+              bool remapped = rootMapping.contains(opr);
+              // Or it is available because it dominates `sr`. Query the value
+              // itself: block arguments have no defining op to ask about.
+              bool dominates = di.properlyDominates(opr, &*sr.begin);
+              return remapped || dominates;
+            })) {
+          // Safe to parallelize operations which have all operands available in
+          // the root parallel block can be executed there.
+          parallelBuilder.clone(op, rootMapping);
+        } else {
+          // An unavailable operand means that no non-safe-to-parallelize
+          // operation feeding `op` is transitively used outside `sr`, so
+          // there should be no transitive uses outside `sr` of `op` either.
+          assert(llvm::all_of(op.getResults(), [&](Value v) {
+            return !isTransitivelyUsedOutside(v, sr);
+          }));
+          allParallelized = false;
+        }
+      } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
+        auto hoisted =
+            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+        rootMapping.map(&*alloca, &*hoisted);
+        rootMapping.map(alloca.getResult(), hoisted.getResult());
+        copyPrivate.push_back(hoisted);
+        allParallelized = false;
+      } else {
+        singleBuilder.clone(op, singleMapping);
+        // Prepare reloaded values for results of operations that cannot be
+        // safely parallelized and which are used after the region `sr`.
+        for (auto res : op.getResults()) {
+          if (isTransitivelyUsedOutside(res, sr)) {
+            auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
+                                          parallelBuilder, singleMapping);
+            if (alloc)
+              copyPrivate.push_back(alloc);
+          }
+        }
+        allParallelized = false;
+      }
+    }
+    singleBuilder.create<omp::TerminatorOp>(loc);
+    return {allParallelized, copyPrivate};
+  };
+
+  for (Block &block : sourceRegion) {
+    Block *targetBlock = rootBuilder.createBlock(
+        &targetRegion, {}, block.getArgumentTypes(),
+        llvm::map_to_vector(block.getArguments(),
+                            [](BlockArgument arg) { return arg.getLoc(); }));
+    rootMapping.map(&block, targetBlock);
+    rootMapping.map(block.getArguments(), targetBlock->getArguments());
+  }
+
+  auto handleOneBlock = [&](Block &block) {
+    Block &targetBlock = *rootMapping.lookup(&block);
+    rootBuilder.setInsertionPointToStart(&targetBlock);
+    Operation *terminator = block.getTerminator();
+    SmallVector<std::variant<SingleRegion, Operation *>> regions;
+
+    auto it = block.begin();
+    auto getOneRegion = [&]() {
+      if (&*it == terminator)
+        return false;
+      if (mustParallelizeOp(&*it)) {
+        regions.push_back(&*it);
+        it++;
+        return true;
+      }
+      SingleRegion sr;
+      sr.begin = it;
+      while (&*it != terminator && !mustParallelizeOp(&*it))
+        it++;
+      sr.end = it;
+      assert(sr.begin != sr.end);
+      regions.push_back(sr);
+      return true;
+    };
+    while (getOneRegion())
+      ;
+
+    for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
+      bool isLast = i + 1 == regions.size();
+      if (std::holds_alternative<SingleRegion>(opOrSingle)) {
+        OpBuilder singleBuilder(sourceRegion.getContext());
+        Block *singleBlock = new Block();
+        singleBuilder.setInsertionPointToStart(singleBlock);
+
+        OpBuilder allocaBuilder(sourceRegion.getContext());
+        Block *allocaBlock = new Block();
+        allocaBuilder.setInsertionPointToStart(allocaBlock);
+
+        OpBuilder parallelBuilder(sourceRegion.getContext());
+        Block *parallelBlock = new Block();
+        parallelBuilder.setInsertionPointToStart(parallelBlock);
+
+        auto [allParallelized, copyprivateVars] =
+            moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
+                         singleBuilder, parallelBuilder);
+        if (allParallelized) {
+          // No omp.single needed: every operation was safe to parallelize.
+          assert(copyprivateVars.empty());
+          assert(allocaBlock->empty());
+          delete singleBlock;
+        } else {
+          omp::SingleOperands singleOperands;
+          if (isLast)
+            singleOperands.nowait = rootBuilder.getUnitAttr();
+          singleOperands.copyprivateVars = copyprivateVars;
+          cleanupBlock(singleBlock);
+          for (auto var : singleOperands.copyprivateVars) {
+            mlir::func::FuncOp funcOp =
+                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
+            singleOperands.copyprivateSyms.push_back(
+                SymbolRefAttr::get(funcOp));
+          }
+          omp::SingleOp singleOp =
+              rootBuilder.create<omp::SingleOp>(loc, singleOperands);
+          singleOp.getRegion().push_back(singleBlock);
+          targetRegion.front().getOperations().splice(
+              singleOp->getIterator(), allocaBlock->getOperations());
+        }
+        rootBuilder.getInsertionBlock()->getOperations().splice(
+            rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
+        delete allocaBlock;
+        delete parallelBlock;
+      } else {
+        auto op = std::get<Operation *>(opOrSingle);
+        if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
+          omp::WsloopOperands wsloopOperands;
+          if (isLast)
+            wsloopOperands.nowait = rootBuilder.getUnitAttr();
+          auto wsloop =
+              rootBuilder.create<mlir::omp::WsloopOp>(loc, wsloopOperands);
+          auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
+              rootBuilder.clone(*wslw, rootMapping));
+          wsloop.getRegion().takeBody(clonedWslw.getRegion());
+          clonedWslw->erase();
+        } else {
+          assert(mustParallelizeOp(op));
+          Operation *cloned = rootBuilder.cloneWithoutRegions(*op, rootMapping);
+          for (auto [region, clonedRegion] :
+               llvm::zip(op->getRegions(), cloned->getRegions()))
+            parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
+        }
+      }
+    }
+
+    rootBuilder.clone(*block.getTerminator(), rootMapping);
+  };
+
+  if (sourceRegion.hasOneBlock()) {
+    handleOneBlock(sourceRegion.front());
+  } else {
+    auto &domTree = di.getDomTree(&sourceRegion);
+    for (auto node : llvm::breadth_first(domTree.getRootNode()))
+      handleOneBlock(*node->getBlock());
+  }
+
+  for (Block &targetBlock : targetRegion)
+    cleanupBlock(&targetBlock);
+}
+
+/// Lowers workshare to a sequence of single-thread regions and parallel loops
+///
+/// For example:
+///
+/// omp.workshare {
+///   %a = fir.allocmem
+///   omp.workshare.loop_wrapper {}
+///   fir.call Assign %b %a
+///   fir.freemem %a
+/// }
+///
+/// becomes
+///
+/// %tmp = fir.alloca
+/// omp.single copyprivate(%tmp) {
+///   %a = fir.allocmem
+///   fir.store %a %tmp
+/// }
+/// %a_reloaded = fir.load %tmp
+/// omp.wsloop {}
+/// omp.single {
+///   fir.call Assign %b %a_reloaded
+///   fir.freemem %a_reloaded
+/// }
+///
+/// Note that we allocate temporary memory for values in omp.single's which need
+/// to be accessed by all threads and broadcast them using single's copyprivate
+LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
+  Location loc = wsOp->getLoc();
+  IRMapping rootMapping;
+
+  OpBuilder rootBuilder(wsOp);
+
+  // FIXME Currently, we only support workshare constructs with structured
+  // control flow. The transformation itself supports CFG, however, once we
+  // transform the MLIR region in the omp.workshare, we need to inline that
+  // region in the parent block. We have no guarantees at this point of the
+  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
+  // generally possible.  The alternative is to put the lowered region in an
+  // operation akin to scf.execute_region, which will get lowered at the same
+  // time when fir ops get lowered to CFG. However, SCF is not registered in
+  // flang so we cannot use it. Remove this requirement once we have
+  // scf.execute_region or an alternative operation available.
+  if (wsOp.getRegion().getBlocks().size() == 1) {
+    // This operation is just a placeholder which will be erased later. We need
+    // it because our `parallelizeRegion` function works on regions and not
+    // blocks.
+    omp::WorkshareOp newOp =
+        rootBuilder.create<omp::WorkshareOp>(loc, omp::WorkshareOperands());
+    if (!wsOp.getNowait())
+      rootBuilder.create<omp::BarrierOp>(loc);
+
+    parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
+                      di);
+
+    // Inline the contents of the placeholder workshare op into its parent
+    // block.
+    Block *theBlock = &newOp.getRegion().front();
+    Operation *term = theBlock->getTerminator();
+    Block *parentBlock = wsOp->getBlock();
+    parentBlock->getOperations().splice(newOp->getIterator(),
+                                        theBlock->getOperations());
+    assert(term->getNumOperands() == 0);
+    term->erase();
+    newOp->erase();
+    wsOp->erase();
+  } else {
+    // Otherwise just change the operation to an omp.single.
+
+    wsOp->emitWarning(
+        "omp workshare with unstructured control flow is currently "
+        "unsupported and will be serialized.");
+
+    // `shouldUseWorkshareLowering` should have guaranteed that there are no
+    // omp.workshare.loop_wrapper ops that bind to this omp.workshare.
+    assert(!wsOp->walk([&](Operation *op) {
+                  // Nested omp.workshare ops can have their own
+                  // omp.workshare.loop_wrapper ops.
+                  if (isa<omp::WorkshareOp>(op))
+                    return WalkResult::skip();
+                  if (isa<omp::WorkshareLoopWrapperOp>(op))
+                    return WalkResult::interrupt();
+                  return WalkResult::advance();
+                })
+                .wasInterrupted());
+
+    omp::SingleOperands operands;
+    operands.nowait = wsOp.getNowaitAttr();
+    omp::SingleOp newOp = rootBuilder.create<omp::SingleOp>(loc, operands);
+
+    newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(),
+                                         wsOp.getRegion().getBlocks());
+    wsOp->erase();
+  }
+  return success();
+}
+
+class LowerWorksharePass
+    : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
+public:
+  void runOnOperation() override {
+    mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
+    getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
+      if (failed(lowerWorkshare(wsOp, di)))
+        signalPassFailure();
+    });
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a914407991591..31af3531641dd 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -212,7 +212,7 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
                                   llvm::OptimizationLevel optLevel) {
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
@@ -230,6 +230,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
+  if (enableOpenMP)
+    pm.addPass(flangomp::createLowerWorkshare());
 }
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
@@ -303,7 +305,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
                                   MLIRToLLVMPassPipelineConfig &config,
                                   llvm::StringRef inputFilename) {
-  fir::createHLFIRToFIRPassPipeline(pm, config.OptLevel);
+  fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel);
 
   // Add default optimizer pass pipeline.
   fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
index 5eca86529f9e1..162df8f9cab9c 100644
--- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFCommon.cpp
@@ -22,9 +22,6 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
   mlir::OpBuilder builder(ctx);
   auto gpuMod = builder.create<mlir::gpu::GPUModuleOp>(mod.getLoc(),
                                                        cudaDeviceModuleName);
-  llvm::SmallVector<mlir::Attribute> targets;
-  targets.push_back(mlir::NVVM::NVVMTargetAttr::get(ctx));
-  gpuMod.setTargetsAttr(builder.getArrayAttr(targets));
   mlir::Block::iterator insertPt(mod.getBodyRegion().front().end());
   symTab.insert(gpuMod, insertPt);
   return gpuMod;
diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index a69b47ff74391..714b0b291be1e 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -26,25 +26,37 @@ namespace fir {
 
 namespace {
 
+static void processAddrOfOp(fir::AddrOfOp addrOfOp,
+                            mlir::SymbolTable &symbolTable, bool onlyConstant) {
+  auto globalOp = symbolTable.lookup<fir::GlobalOp>(
+      addrOfOp.getSymbol().getRootReference().getValue());
+  // Skip unresolved symbols, non-candidates, and already attributed globals.
+  if (!globalOp || (onlyConstant && !globalOp.getConstant()) ||
+      globalOp.getDataAttr())
+    return;
+  globalOp.setDataAttrAttr(cuf::DataAttributeAttr::get(
+      addrOfOp.getContext(), globalOp.getConstant()
+                                 ? cuf::DataAttribute::Constant
+                                 : cuf::DataAttribute::Device));
+}
+
 static void prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
                                          mlir::SymbolTable &symbolTable,
                                          bool onlyConstant = true) {
   auto cudaProcAttr{
       funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName())};
-  if (!cudaProcAttr || cudaProcAttr.getValue() == cuf::ProcAttribute::Host)
-    return;
-  for (auto addrOfOp : funcOp.getBody().getOps<fir::AddrOfOp>()) {
-    if (auto globalOp = symbolTable.lookup<fir::GlobalOp>(
-            addrOfOp.getSymbol().getRootReference().getValue())) {
-      bool isCandidate{(onlyConstant ? globalOp.getConstant() : true) &&
-                       !globalOp.getDataAttr()};
-      if (isCandidate)
-        globalOp.setDataAttrAttr(cuf::DataAttributeAttr::get(
-            funcOp.getContext(), globalOp.getConstant()
-                                     ? cuf::DataAttribute::Constant
-                                     : cuf::DataAttribute::Device));
+  if (!cudaProcAttr || cudaProcAttr.getValue() == cuf::ProcAttribute::Host) {
+    // Look for globals in CUF KERNEL DO operations.
+    for (auto cufKernelOp : funcOp.getBody().getOps<cuf::KernelOp>()) {
+      cufKernelOp.walk([&](fir::AddrOfOp addrOfOp) {
+        processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
+      });
     }
+    return;
   }
+  funcOp.walk([&](fir::AddrOfOp addrOfOp) {
+    processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
+  });
 }
 
 class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9de20f0f0d45e..f1ebd08967b9a 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -628,6 +628,12 @@ struct CUFDataTransferOpConversion
 
       mlir::Value dst = getDeviceAddress(rewriter, op.getDstMutable(), symtab);
       mlir::Value src = getDeviceAddress(rewriter, op.getSrcMutable(), symtab);
+      // Materialize a constant src; match the Value so block args are safe.
+      if (matchPattern(src, mlir::m_Constant())) {
+        mlir::Value temp = builder.createTemporary(loc, srcTy);
+        builder.create<fir::StoreOp>(loc, src, temp);
+        src = temp;
+      }
       llvm::SmallVector<mlir::Value> args{
           fir::runtime::createArguments(builder, loc, fTy, dst, src, bytes,
                                         modeValue, sourceFile, sourceLine)};
@@ -654,7 +660,7 @@ struct CUFDataTransferOpConversion
               loc, builder);
       }
       auto materializeBoxIfNeeded = [&](mlir::Value val) -> mlir::Value {
-        if (mlir::isa<fir::EmboxOp>(val.getDefiningOp())) {
+        if (mlir::isa<fir::EmboxOp, fir::ReboxOp>(val.getDefiningOp())) {
           // Materialize the box to memory to be able to call the runtime.
           mlir::Value box = builder.createTemporary(loc, val.getType());
           builder.create<fir::StoreOp>(loc, val, box);
diff --git a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp
index 648628fd1c9af..cfd90ff723793 100644
--- a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp
@@ -12,6 +12,7 @@
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
@@ -58,26 +59,36 @@ void ExternalNameConversionPass::runOnOperation() {
   auto *context = &getContext();
 
   llvm::DenseMap<mlir::StringAttr, mlir::FlatSymbolRefAttr> remappings;
-  // Update names of external Fortran functions and names of Common Block
-  // globals.
-  for (auto &funcOrGlobal : op->getRegion(0).front()) {
-    if (llvm::isa<mlir::func::FuncOp>(funcOrGlobal) ||
-        llvm::isa<fir::GlobalOp>(funcOrGlobal)) {
-      auto symName = funcOrGlobal.getAttrOfType<mlir::StringAttr>(
-          mlir::SymbolTable::getSymbolAttrName());
-      auto deconstructedName = fir::NameUniquer::deconstruct(symName);
-      if (fir::NameUniquer::isExternalFacingUniquedName(deconstructedName)) {
-        auto newName =
-            mangleExternalName(deconstructedName, appendUnderscoreOpt);
-        auto newAttr = mlir::StringAttr::get(context, newName);
-        mlir::SymbolTable::setSymbolName(&funcOrGlobal, newAttr);
-        auto newSymRef = mlir::FlatSymbolRefAttr::get(newAttr);
-        remappings.try_emplace(symName, newSymRef);
-        if (llvm::isa<mlir::func::FuncOp>(funcOrGlobal))
-          funcOrGlobal.setAttr(fir::getInternalFuncNameAttrName(), symName);
+
+  auto renameFuncOrGlobalInModule = [&](mlir::Operation *module) {
+    for (auto &funcOrGlobal : module->getRegion(0).front()) {
+      if (llvm::isa<mlir::func::FuncOp>(funcOrGlobal) ||
+          llvm::isa<fir::GlobalOp>(funcOrGlobal)) {
+        auto symName = funcOrGlobal.getAttrOfType<mlir::StringAttr>(
+            mlir::SymbolTable::getSymbolAttrName());
+        auto deconstructedName = fir::NameUniquer::deconstruct(symName);
+        if (fir::NameUniquer::isExternalFacingUniquedName(deconstructedName)) {
+          auto newName =
+              mangleExternalName(deconstructedName, appendUnderscoreOpt);
+          auto newAttr = mlir::StringAttr::get(context, newName);
+          mlir::SymbolTable::setSymbolName(&funcOrGlobal, newAttr);
+          auto newSymRef = mlir::FlatSymbolRefAttr::get(newAttr);
+          remappings.try_emplace(symName, newSymRef);
+          if (llvm::isa<mlir::func::FuncOp>(funcOrGlobal))
+            funcOrGlobal.setAttr(fir::getInternalFuncNameAttrName(), symName);
+        }
       }
     }
-  }
+  };
+
+  // Update names of external Fortran functions and names of Common Block
+  // globals.
+  renameFuncOrGlobalInModule(op);
+
+  // Do the same in GPU modules.
+  if (auto mod = mlir::dyn_cast_or_null<mlir::ModuleOp>(*op))
+    for (auto gpuMod : mod.getOps<mlir::gpu::GPUModuleOp>())
+      renameFuncOrGlobalInModule(gpuMod);
 
   if (remappings.empty())
     return;
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 5ead9a48fa896..630acf9a6b256 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -97,7 +97,7 @@ template <typename Separator> struct MapModifiers {
 
   // Parsing of mappers is not supported yet.
   using TypeModParser = Parser<OmpMapClause::TypeModifier>;
-  using IterParser = Parser<OmpIteratorModifier>;
+  using IterParser = Parser<OmpIterator>;
   using TypeParser = Parser<OmpMapClause::Type>;
   using ModParser =
       ConcatSeparated<Separator, TypeModParser, IterParser, TypeParser>;
@@ -131,9 +131,8 @@ template <typename Separator> struct MotionModifiers {
   constexpr MotionModifiers(const MotionModifiers &) = default;
   constexpr MotionModifiers(MotionModifiers &&) = default;
 
-  // Parsing of mappers if not implemented yet.
   using ExpParser = Parser<OmpFromClause::Expectation>;
-  using IterParser = Parser<OmpIteratorModifier>;
+  using IterParser = Parser<OmpIterator>;
   using ModParser = ConcatSeparated<Separator, ExpParser, IterParser>;
 
   using resultType = typename ModParser::resultType;
@@ -191,6 +190,8 @@ static TypeDeclarationStmt makeIterSpecDecl(std::list<ObjectName> &&names) {
       makeEntityList(std::move(names)));
 }
 
+// --- Parsers for clause modifiers -----------------------------------
+
 TYPE_PARSER(construct<OmpIteratorSpecifier>(
     // Using Parser<TypeDeclarationStmt> or Parser<EntityDecl> has the problem
     // that they will attempt to treat what follows the '=' as initialization.
@@ -207,14 +208,40 @@ TYPE_PARSER(construct<OmpIteratorSpecifier>(
             makeIterSpecDecl, nonemptyList(Parser<ObjectName>{}) / "="_tok)),
     subscriptTriplet))
 
+TYPE_PARSER(construct<OmpDependenceType>(
+    "SINK" >> pure(OmpDependenceType::Value::Sink) ||
+    "SOURCE" >> pure(OmpDependenceType::Value::Source)))
+
 // [5.0] 2.1.6 iterator -> iterator-specifier-list
-TYPE_PARSER(construct<OmpIteratorModifier>("ITERATOR" >>
+TYPE_PARSER(construct<OmpIterator>("ITERATOR" >>
     parenthesized(nonemptyList(sourced(Parser<OmpIteratorSpecifier>{})))))
 
+// 2.15.3.7 LINEAR (linear-list: linear-step)
+//          linear-list -> list | modifier(list)
+//          linear-modifier -> REF | VAL | UVAL
+TYPE_PARSER(construct<OmpLinearModifier>( //
+    "REF" >> pure(OmpLinearModifier::Value::Ref) ||
+    "VAL" >> pure(OmpLinearModifier::Value::Val) ||
+    "UVAL" >> pure(OmpLinearModifier::Value::Uval)))
+
+// 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list)
+TYPE_PARSER(construct<OmpReductionIdentifier>(Parser<DefinedOperator>{}) ||
+    construct<OmpReductionIdentifier>(Parser<ProcedureDesignator>{}))
+
+TYPE_PARSER(construct<OmpTaskDependenceType>(
+    "DEPOBJ" >> pure(OmpTaskDependenceType::Value::Depobj) ||
+    "IN"_id >> pure(OmpTaskDependenceType::Value::In) ||
+    "INOUT"_id >> pure(OmpTaskDependenceType::Value::Inout) ||
+    "INOUTSET"_id >> pure(OmpTaskDependenceType::Value::Inoutset) ||
+    "MUTEXINOUTSET" >> pure(OmpTaskDependenceType::Value::Mutexinoutset) ||
+    "OUT" >> pure(OmpTaskDependenceType::Value::Out)))
+
+// --- Parsers for clauses --------------------------------------------
+
 // [5.0] 2.10.1 affinity([aff-modifier:] locator-list)
 //              aff-modifier: interator-modifier
 TYPE_PARSER(construct<OmpAffinityClause>(
-    maybe(Parser<OmpIteratorModifier>{} / ":"), Parser<OmpObjectList>{}))
+    maybe(Parser<OmpIterator>{} / ":"), Parser<OmpObjectList>{}))
 
 // 2.15.3.1 DEFAULT (PRIVATE | FIRSTPRIVATE | SHARED | NONE)
 TYPE_PARSER(construct<OmpDefaultClause>(
@@ -250,21 +277,26 @@ TYPE_PARSER(
         "TOFROM" >> pure(OmpMapClause::Type::Tofrom)))
 
 template <bool CommasEverywhere>
-static inline OmpMapClause makeMapClause(
+static inline OmpMapClause makeMapClause(OmpMapperIdentifier &&mm,
     std::tuple<std::optional<std::list<OmpMapClause::TypeModifier>>,
-        std::optional<std::list<OmpIteratorModifier>>,
+        std::optional<std::list<OmpIterator>>,
         std::optional<std::list<OmpMapClause::Type>>> &&mods,
     OmpObjectList &&objs) {
   auto &&[tm, it, ty] = std::move(mods);
-  return OmpMapClause{std::move(tm), std::move(it), std::move(ty),
-      std::move(objs), CommasEverywhere};
+  return OmpMapClause{std::move(mm), std::move(tm), std::move(it),
+      std::move(ty), std::move(objs), CommasEverywhere};
 }
 
+TYPE_PARSER(construct<OmpMapperIdentifier>(
+    maybe("MAPPER"_tok >> parenthesized(name) / ","_tok)))
+
 TYPE_PARSER(construct<OmpMapClause>(
-    applyFunction<OmpMapClause>(
-        makeMapClause<true>, MapModifiers(","_tok), Parser<OmpObjectList>{}) ||
+    applyFunction<OmpMapClause>(makeMapClause<true>,
+        Parser<OmpMapperIdentifier>{}, MapModifiers(","_tok),
+        Parser<OmpObjectList>{}) ||
     applyFunction<OmpMapClause>(makeMapClause<false>,
-        MapModifiers(maybe(","_tok)), Parser<OmpObjectList>{})))
+        Parser<OmpMapperIdentifier>{}, MapModifiers(maybe(","_tok)),
+        Parser<OmpObjectList>{})))
 
 // [OpenMP 5.0]
 // 2.19.7.2 defaultmap(implicit-behavior[:variable-category])
@@ -346,21 +378,17 @@ TYPE_PARSER(construct<OmpIfClause>(
         ":"),
     scalarLogicalExpr))
 
-// 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list)
-TYPE_PARSER(construct<OmpReductionOperator>(Parser<DefinedOperator>{}) ||
-    construct<OmpReductionOperator>(Parser<ProcedureDesignator>{}))
-
 TYPE_PARSER(construct<OmpReductionClause>(
     maybe(
         ("INSCAN" >> pure(OmpReductionClause::ReductionModifier::Inscan) ||
             "TASK" >> pure(OmpReductionClause::ReductionModifier::Task) ||
             "DEFAULT" >> pure(OmpReductionClause::ReductionModifier::Default)) /
         ","),
-    Parser<OmpReductionOperator>{} / ":", Parser<OmpObjectList>{}))
+    Parser<OmpReductionIdentifier>{} / ":", Parser<OmpObjectList>{}))
 
 // OMP 5.0 2.19.5.6 IN_REDUCTION (reduction-identifier: variable-name-list)
 TYPE_PARSER(construct<OmpInReductionClause>(
-    Parser<OmpReductionOperator>{} / ":", Parser<OmpObjectList>{}))
+    Parser<OmpReductionIdentifier>{} / ":", Parser<OmpObjectList>{}))
 
 // OMP 5.0 2.11.4 allocate-clause -> ALLOCATE ([allocator:] variable-name-list)
 // OMP 5.2 2.13.4 allocate-clause -> ALLOCATE ([allocate-modifier
@@ -393,18 +421,6 @@ TYPE_PARSER(construct<OmpAllocateClause>(
         ":"),
     Parser<OmpObjectList>{}))
 
-TYPE_PARSER(construct<OmpDependenceType>(
-    "SINK" >> pure(OmpDependenceType::Type::Sink) ||
-    "SOURCE" >> pure(OmpDependenceType::Type::Source)))
-
-TYPE_PARSER(construct<OmpTaskDependenceType>(
-    "DEPOBJ" >> pure(OmpTaskDependenceType::Type::Depobj) ||
-    "IN"_id >> pure(OmpTaskDependenceType::Type::In) ||
-    "INOUT"_id >> pure(OmpTaskDependenceType::Type::Inout) ||
-    "INOUTSET"_id >> pure(OmpTaskDependenceType::Type::Inoutset) ||
-    "MUTEXINOUTSET" >> pure(OmpTaskDependenceType::Type::Mutexinoutset) ||
-    "OUT" >> pure(OmpTaskDependenceType::Type::Out)))
-
 // iteration-offset -> +/- non-negative-constant-expr
 TYPE_PARSER(construct<OmpIterationOffset>(
     Parser<DefinedOperator>{}, scalarIntConstantExpr))
@@ -422,7 +438,7 @@ TYPE_PARSER(construct<OmpDoacross>(
 TYPE_CONTEXT_PARSER("Omp Depend clause"_en_US,
     construct<OmpDependClause>(
         construct<OmpDependClause>(construct<OmpDependClause::TaskDep>(
-            maybe(Parser<OmpIteratorModifier>{} / ","_tok),
+            maybe(Parser<OmpIterator>{} / ","_tok),
             Parser<OmpTaskDependenceType>{} / ":", Parser<OmpObjectList>{})) ||
         construct<OmpDependClause>(Parser<OmpDoacross>{})))
 
@@ -435,7 +451,7 @@ TYPE_PARSER(construct<OmpFromClause::Expectation>(
 template <typename MotionClause, bool CommasEverywhere>
 static inline MotionClause makeMotionClause(
     std::tuple<std::optional<std::list<typename MotionClause::Expectation>>,
-        std::optional<std::list<OmpIteratorModifier>>> &&mods,
+        std::optional<std::list<OmpIterator>>> &&mods,
     OmpObjectList &&objs) {
   auto &&[exp, iter] = std::move(mods);
   return MotionClause(
@@ -454,14 +470,6 @@ TYPE_PARSER(construct<OmpToClause>(
     applyFunction<OmpToClause>(makeMotionClause<OmpToClause, false>,
         MotionModifiers(maybe(","_tok)), Parser<OmpObjectList>{})))
 
-// 2.15.3.7 LINEAR (linear-list: linear-step)
-//          linear-list -> list | modifier(list)
-//          linear-modifier -> REF | VAL | UVAL
-TYPE_PARSER(
-    construct<OmpLinearModifier>("REF" >> pure(OmpLinearModifier::Type::Ref) ||
-        "VAL" >> pure(OmpLinearModifier::Type::Val) ||
-        "UVAL" >> pure(OmpLinearModifier::Type::Uval)))
-
 TYPE_CONTEXT_PARSER("Omp LINEAR clause"_en_US,
     construct<OmpLinearClause>(
         construct<OmpLinearClause>(construct<OmpLinearClause::WithModifier>(
@@ -844,7 +852,7 @@ TYPE_PARSER(construct<OmpReductionInitializerClause>(
 // 2.16 Declare Reduction Construct
 TYPE_PARSER(sourced(construct<OpenMPDeclareReductionConstruct>(
     verbatim("DECLARE REDUCTION"_tok),
-    "(" >> Parser<OmpReductionOperator>{} / ":",
+    "(" >> Parser<OmpReductionIdentifier>{} / ":",
     nonemptyList(Parser<DeclarationTypeSpec>{}) / ":",
     Parser<OmpReductionCombiner>{} / ")",
     maybe(Parser<OmpReductionInitializerClause>{}))))
diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp
index 574e5fd84862e..24b2902f286f4 100644
--- a/flang/lib/Parser/parse-tree.cpp
+++ b/flang/lib/Parser/parse-tree.cpp
@@ -253,20 +253,20 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Name &x) {
   return os << x.ToString();
 }
 
-OmpDependenceType::Type OmpDoacross::GetDepType() const {
+OmpDependenceType::Value OmpDoacross::GetDepType() const {
   return common::visit( //
       common::visitors{
           [](const OmpDoacross::Sink &) {
-            return OmpDependenceType::Type::Sink;
+            return OmpDependenceType::Value::Sink;
           },
           [](const OmpDoacross::Source &) {
-            return OmpDependenceType::Type::Source;
+            return OmpDependenceType::Value::Source;
           },
       },
       u);
 }
 
-OmpTaskDependenceType::Type OmpDependClause::TaskDep::GetTaskDepType() const {
+OmpTaskDependenceType::Value OmpDependClause::TaskDep::GetTaskDepType() const {
   return std::get<parser::OmpTaskDependenceType>(t).v;
 }
 
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index a782dfb8d767a..4d6aaceb69c18 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2079,7 +2079,7 @@ class UnparseVisitor {
     Put(" = ");
     Walk(std::get<SubscriptTriplet>(x.t));
   }
-  void Unparse(const OmpIteratorModifier &x) {
+  void Unparse(const OmpIterator &x) {
     Word("ITERATOR(");
     Walk(x.v);
     Put(")");
@@ -2093,14 +2093,24 @@ class UnparseVisitor {
   void Unparse(const OmpMapClause &x) {
     auto &typeMod =
         std::get<std::optional<std::list<OmpMapClause::TypeModifier>>>(x.t);
-    auto &iter = std::get<std::optional<std::list<OmpIteratorModifier>>>(x.t);
+    auto &iter = std::get<std::optional<std::list<OmpIterator>>>(x.t);
     auto &type = std::get<std::optional<std::list<OmpMapClause::Type>>>(x.t);
+    auto &mapper = std::get<OmpMapperIdentifier>(x.t);
 
     // For a given list of items, if the item has a value, then walk it.
     // Print commas between items that have values.
     // Return 'true' if something did get printed, otherwise 'false'.
     bool needComma{false};
+    if (mapper.v) {
+      Word("MAPPER(");
+      Walk(*mapper.v);
+      Put(")");
+      needComma = true;
+    }
     if (typeMod) {
+      if (needComma) {
+        Put(", ");
+      }
       Walk(*typeMod);
       needComma = true;
     }
@@ -2137,7 +2147,7 @@ class UnparseVisitor {
     Walk(std::get<ScalarIntExpr>(x.t));
   }
   void Unparse(const OmpAffinityClause &x) {
-    Walk(std::get<std::optional<OmpIteratorModifier>>(x.t), ":");
+    Walk(std::get<std::optional<OmpIterator>>(x.t), ":");
     Walk(std::get<OmpObjectList>(x.t));
   }
   void Unparse(const OmpAlignedClause &x) {
@@ -2148,7 +2158,7 @@ class UnparseVisitor {
   void Unparse(const OmpFromClause &x) {
     auto &expect{
         std::get<std::optional<std::list<OmpFromClause::Expectation>>>(x.t)};
-    auto &iter{std::get<std::optional<std::list<OmpIteratorModifier>>>(x.t)};
+    auto &iter{std::get<std::optional<std::list<OmpIterator>>>(x.t)};
     bool needComma{false};
     if (expect) {
       Walk(*expect);
@@ -2181,13 +2191,13 @@ class UnparseVisitor {
   void Unparse(const OmpReductionClause &x) {
     Walk(std::get<std::optional<OmpReductionClause::ReductionModifier>>(x.t),
         ",");
-    Walk(std::get<OmpReductionOperator>(x.t));
+    Walk(std::get<OmpReductionIdentifier>(x.t));
     Put(":");
     Walk(std::get<OmpObjectList>(x.t));
   }
   void Unparse(const OmpDetachClause &x) { Walk(x.v); }
   void Unparse(const OmpInReductionClause &x) {
-    Walk(std::get<OmpReductionOperator>(x.t));
+    Walk(std::get<OmpReductionIdentifier>(x.t));
     Put(":");
     Walk(std::get<OmpObjectList>(x.t));
   }
@@ -2253,7 +2263,7 @@ class UnparseVisitor {
   void Unparse(const OmpToClause &x) {
     auto &expect{
         std::get<std::optional<std::list<OmpToClause::Expectation>>>(x.t)};
-    auto &iter{std::get<std::optional<std::list<OmpIteratorModifier>>>(x.t)};
+    auto &iter{std::get<std::optional<std::list<OmpIterator>>>(x.t)};
     bool needComma{false};
     if (expect) {
       Walk(*expect);
@@ -2635,7 +2645,7 @@ class UnparseVisitor {
   }
   void Unparse(const OpenMPDeclareReductionConstruct &x) {
     Put("(");
-    Walk(std::get<OmpReductionOperator>(x.t)), Put(" : ");
+    Walk(std::get<OmpReductionIdentifier>(x.t)), Put(" : ");
     Walk(std::get<std::list<DeclarationTypeSpec>>(x.t), ","), Put(" : ");
     Walk(std::get<OmpReductionCombiner>(x.t));
     Put(")");
@@ -2900,8 +2910,8 @@ class UnparseVisitor {
   WALK_NESTED_ENUM(
       OmpLastprivateClause, LastprivateModifier) // OMP lastprivate-modifier
   WALK_NESTED_ENUM(OmpScheduleModifierType, ModType) // OMP schedule-modifier
-  WALK_NESTED_ENUM(OmpLinearModifier, Type) // OMP linear-modifier
-  WALK_NESTED_ENUM(OmpTaskDependenceType, Type) // OMP task-dependence-type
+  WALK_NESTED_ENUM(OmpLinearModifier, Value) // OMP linear-modifier
+  WALK_NESTED_ENUM(OmpTaskDependenceType, Value) // OMP task-dependence-type
   WALK_NESTED_ENUM(OmpScheduleClause, ScheduleType) // OMP schedule-type
   WALK_NESTED_ENUM(OmpDeviceClause, DeviceModifier) // OMP device modifier
   WALK_NESTED_ENUM(OmpDeviceTypeClause, Type) // OMP DEVICE_TYPE
diff --git a/flang/lib/Semantics/CMakeLists.txt b/flang/lib/Semantics/CMakeLists.txt
index 41406ecf50e00..7855ae7eed138 100644
--- a/flang/lib/Semantics/CMakeLists.txt
+++ b/flang/lib/Semantics/CMakeLists.txt
@@ -31,6 +31,7 @@ add_flang_library(FortranSemantics
   definable.cpp
   expression.cpp
   mod-file.cpp
+  openmp-modifiers.cpp
   pointer-assignment.cpp
   program-tree.cpp
   resolve-labels.cpp
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index eaf1d52a9fc1a..79b7a26ef222f 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -91,6 +91,37 @@ struct DeviceExprChecker
   }
 };
 
+struct FindHostArray
+    : public evaluate::AnyTraverse<FindHostArray, const Symbol *> {
+  using Result = const Symbol *;
+  using Base = evaluate::AnyTraverse<FindHostArray, Result>;
+  FindHostArray() : Base(*this) {}
+  using Base::operator();
+  Result operator()(const evaluate::Component &x) const {
+    const Symbol &symbol{x.GetLastSymbol()};
+    if (IsAllocatableOrPointer(symbol)) {
+      if (Result hostArray{(*this)(symbol)}) {
+        return hostArray;
+      }
+    }
+    return (*this)(x.base());
+  }
+  Result operator()(const Symbol &symbol) const {
+    if (const auto *details{
+            symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
+      if (details->IsArray() &&
+          (!details->cudaDataAttr() ||
+              (details->cudaDataAttr() &&
+                  *details->cudaDataAttr() != common::CUDADataAttr::Device &&
+                  *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
+                  *details->cudaDataAttr() != common::CUDADataAttr::Unified))) {
+        return &symbol;
+      }
+    }
+    return nullptr;
+  }
+};
+
 template <typename A> static MaybeMsg CheckUnwrappedExpr(const A &x) {
   if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
     return DeviceExprChecker{}(expr->typedExpr);
@@ -306,22 +337,11 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
     }
   }
   template <typename A>
-  void ErrorIfHostSymbol(const A &expr, const parser::CharBlock &source) {
-    for (const Symbol &sym : CollectCudaSymbols(expr)) {
-      if (const auto *details =
-              sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
-        if (details->IsArray() &&
-            (!details->cudaDataAttr() ||
-                (details->cudaDataAttr() &&
-                    *details->cudaDataAttr() != common::CUDADataAttr::Device &&
-                    *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
-                    *details->cudaDataAttr() !=
-                        common::CUDADataAttr::Unified))) {
-          context_.Say(source,
-              "Host array '%s' cannot be present in CUF kernel"_err_en_US,
-              sym.name());
-        }
-      }
+  void ErrorIfHostSymbol(const A &expr, parser::CharBlock source) {
+    if (const Symbol * hostArray{FindHostArray{}(expr)}) {
+      context_.Say(source,
+          "Host array '%s' cannot be present in CUF kernel"_err_en_US,
+          hostArray->name());
     }
   }
   void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 354594f3339df..c9656d031b2e1 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -1110,7 +1110,8 @@ void CheckHelper::CheckPointerInitialization(const Symbol &symbol) {
       if (proc->init() && *proc->init()) {
         // C1519 - must be nonelemental external or module procedure,
         // or an unrestricted specific intrinsic function.
-        const Symbol &ultimate{(*proc->init())->GetUltimate()};
+        const Symbol &local{DEREF(*proc->init())};
+        const Symbol &ultimate{local.GetUltimate()};
         bool checkTarget{true};
         if (ultimate.attrs().test(Attr::INTRINSIC)) {
           if (auto intrinsic{context_.intrinsics().IsSpecificIntrinsicFunction(
@@ -1123,11 +1124,12 @@ void CheckHelper::CheckPointerInitialization(const Symbol &symbol) {
                 ultimate.name(), symbol.name());
             checkTarget = false;
           }
-        } else if ((!ultimate.attrs().test(Attr::EXTERNAL) &&
-                       ultimate.owner().kind() != Scope::Kind::Module) ||
+        } else if (!(ultimate.attrs().test(Attr::EXTERNAL) ||
+                       ultimate.owner().kind() == Scope::Kind::Module ||
+                       ultimate.owner().IsTopLevel()) ||
             IsDummy(ultimate) || IsPointer(ultimate)) {
-          context_.Say("Procedure pointer '%s' initializer '%s' is neither "
-                       "an external nor a module procedure"_err_en_US,
+          context_.Say(
+              "Procedure pointer '%s' initializer '%s' is neither an external nor a module procedure"_err_en_US,
               symbol.name(), ultimate.name());
           checkTarget = false;
         } else if (IsElementalProcedure(ultimate)) {
diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp
index eeeda553d8a46..3c99163c1f134 100644
--- a/flang/lib/Semantics/check-io.cpp
+++ b/flang/lib/Semantics/check-io.cpp
@@ -860,6 +860,8 @@ void IoChecker::Leave(const parser::WriteStmt &writeStmt) {
 
 void IoChecker::LeaveReadWrite() const {
   CheckForRequiredSpecifier(IoSpecKind::Unit); // C1211
+  CheckForRequiredSpecifier(flags_.test(Flag::InternalUnit),
+      "UNIT=internal-file", flags_.test(Flag::FmtOrNml), "FMT or NML");
   CheckForProhibitedSpecifier(IoSpecKind::Nml, IoSpecKind::Rec); // C1216
   CheckForProhibitedSpecifier(IoSpecKind::Nml, IoSpecKind::Fmt); // C1216
   CheckForProhibitedSpecifier(
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 0b64a4a9801cc..9cac652216fcf 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -683,8 +683,7 @@ void OmpStructureChecker::CheckIteratorRange(
   }
 }
 
-void OmpStructureChecker::CheckIteratorModifier(
-    const parser::OmpIteratorModifier &x) {
+void OmpStructureChecker::CheckIteratorModifier(const parser::OmpIterator &x) {
   // Check if all iterator variables have integer type.
   for (auto &&iterSpec : x.v) {
     bool isInteger{true};
@@ -1859,21 +1858,21 @@ void OmpStructureChecker::CheckTargetUpdate() {
 }
 
 void OmpStructureChecker::CheckTaskDependenceType(
-    const parser::OmpTaskDependenceType::Type &x) {
+    const parser::OmpTaskDependenceType::Value &x) {
   // Common checks for task-dependence-type (DEPEND and UPDATE clauses).
   unsigned version{context_.langOptions().OpenMPVersion};
   unsigned since{0};
 
   switch (x) {
-  case parser::OmpTaskDependenceType::Type::In:
-  case parser::OmpTaskDependenceType::Type::Out:
-  case parser::OmpTaskDependenceType::Type::Inout:
+  case parser::OmpTaskDependenceType::Value::In:
+  case parser::OmpTaskDependenceType::Value::Out:
+  case parser::OmpTaskDependenceType::Value::Inout:
     break;
-  case parser::OmpTaskDependenceType::Type::Mutexinoutset:
-  case parser::OmpTaskDependenceType::Type::Depobj:
+  case parser::OmpTaskDependenceType::Value::Mutexinoutset:
+  case parser::OmpTaskDependenceType::Value::Depobj:
     since = 50;
     break;
-  case parser::OmpTaskDependenceType::Type::Inoutset:
+  case parser::OmpTaskDependenceType::Value::Inoutset:
     since = 52;
     break;
   }
@@ -1888,14 +1887,14 @@ void OmpStructureChecker::CheckTaskDependenceType(
 }
 
 void OmpStructureChecker::CheckDependenceType(
-    const parser::OmpDependenceType::Type &x) {
+    const parser::OmpDependenceType::Value &x) {
   // Common checks for dependence-type (DEPEND and UPDATE clauses).
   unsigned version{context_.langOptions().OpenMPVersion};
   unsigned deprecatedIn{~0u};
 
   switch (x) {
-  case parser::OmpDependenceType::Type::Source:
-  case parser::OmpDependenceType::Type::Sink:
+  case parser::OmpDependenceType::Value::Source:
+  case parser::OmpDependenceType::Value::Sink:
     deprecatedIn = 52;
     break;
   }
@@ -2864,7 +2863,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) {
 bool OmpStructureChecker::CheckReductionOperators(
     const parser::OmpClause::Reduction &x) {
 
-  const auto &definedOp{std::get<parser::OmpReductionOperator>(x.v.t)};
+  const auto &definedOp{std::get<parser::OmpReductionIdentifier>(x.v.t)};
   bool ok = false;
   common::visit(
       common::visitors{
@@ -2929,7 +2928,7 @@ bool OmpStructureChecker::CheckIntrinsicOperator(
 
 static bool IsReductionAllowedForType(
     const parser::OmpClause::Reduction &x, const DeclTypeSpec &type) {
-  const auto &definedOp{std::get<parser::OmpReductionOperator>(x.v.t)};
+  const auto &definedOp{std::get<parser::OmpReductionIdentifier>(x.v.t)};
   // TODO: user defined reduction operators. Just allow everything for now.
   bool ok{true};
 
@@ -3484,7 +3483,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Map &x) {
   CheckAllowedClause(llvm::omp::Clause::OMPC_map);
   using TypeMod = parser::OmpMapClause::TypeModifier;
   using Type = parser::OmpMapClause::Type;
-  using IterMod = parser::OmpIteratorModifier;
+  using IterMod = parser::OmpIterator;
 
   unsigned version{context_.langOptions().OpenMPVersion};
   if (auto commas{std::get<bool>(x.v.t)}; !commas && version >= 52) {
@@ -3637,7 +3636,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) {
       if (taskDep) {
         if (version == 50) {
           invalidDep = taskDep->GetTaskDepType() ==
-              parser::OmpTaskDependenceType::Type::Depobj;
+              parser::OmpTaskDependenceType::Value::Depobj;
         }
       } else {
         invalidDep = true;
@@ -3684,7 +3683,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) {
         }
       }
     }
-    if (std::get<std::optional<parser::OmpIteratorModifier>>(taskDep->t)) {
+    if (std::get<std::optional<parser::OmpIterator>>(taskDep->t)) {
       unsigned allowedInVersion{50};
       if (version < allowedInVersion) {
         context_.Say(GetContext().clauseSource,
@@ -3923,7 +3922,8 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Update &x) {
     if (version >= 51) {
       bool invalidDep{false};
       if (taskType) {
-        invalidDep = taskType->v == parser::OmpTaskDependenceType::Type::Depobj;
+        invalidDep =
+            taskType->v == parser::OmpTaskDependenceType::Value::Depobj;
       } else {
         invalidDep = true;
       }
@@ -4058,7 +4058,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::From &x) {
   CheckAllowedClause(llvm::omp::Clause::OMPC_from);
   unsigned version{context_.langOptions().OpenMPVersion};
   using ExpMod = parser::OmpFromClause::Expectation;
-  using IterMod = parser::OmpIteratorModifier;
+  using IterMod = parser::OmpIterator;
 
   if (auto &expMod{std::get<std::optional<std::list<ExpMod>>>(x.v.t)}) {
     unsigned allowedInVersion{51};
@@ -4122,7 +4122,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::To &x) {
   }
   assert(GetContext().directive == llvm::omp::OMPD_target_update);
   using ExpMod = parser::OmpFromClause::Expectation;
-  using IterMod = parser::OmpIteratorModifier;
+  using IterMod = parser::OmpIterator;
 
   if (auto &expMod{std::get<std::optional<std::list<ExpMod>>>(x.v.t)}) {
     unsigned allowedInVersion{51};
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 429e451c463e4..df21ebac0f6d7 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -202,7 +202,7 @@ class OmpStructureChecker
   void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock);
 
   void CheckIteratorRange(const parser::OmpIteratorSpecifier &x);
-  void CheckIteratorModifier(const parser::OmpIteratorModifier &x);
+  void CheckIteratorModifier(const parser::OmpIterator &x);
   void CheckLoopItrVariableIsInt(const parser::OpenMPLoopConstruct &x);
   void CheckDoWhile(const parser::OpenMPLoopConstruct &x);
   void CheckAssociatedLoopConstraints(const parser::OpenMPLoopConstruct &x);
@@ -218,8 +218,8 @@ class OmpStructureChecker
   void CheckSIMDNest(const parser::OpenMPConstruct &x);
   void CheckTargetNest(const parser::OpenMPConstruct &x);
   void CheckTargetUpdate();
-  void CheckDependenceType(const parser::OmpDependenceType::Type &x);
-  void CheckTaskDependenceType(const parser::OmpTaskDependenceType::Type &x);
+  void CheckDependenceType(const parser::OmpDependenceType::Value &x);
+  void CheckTaskDependenceType(const parser::OmpTaskDependenceType::Value &x);
   void CheckCancellationNest(
       const parser::CharBlock &source, const parser::OmpCancelType::Type &type);
   std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x);
diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp
new file mode 100644
index 0000000000000..70ca988cddce5
--- /dev/null
+++ b/flang/lib/Semantics/openmp-modifiers.cpp
@@ -0,0 +1,146 @@
+//===-- flang/lib/Semantics/openmp-modifiers.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Semantics/openmp-modifiers.h"
+
+#include "flang/Parser/parse-tree.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include <algorithm>
+#include <cassert>
+#include <map>
+
+namespace Fortran::semantics {
+using namespace llvm::omp;
+
+/// Return the highest key of `map` that is less than or equal to `version`.
+/// `version` is first clamped to the range of getOpenMPVersions(), since it
+/// may not be one of the values from that list.
+/// If every key is greater than `version`, return the lowest key instead, so
+/// that the callers' `at()` lookups do not fail for older versions.
+template <typename ValueTy>
+static unsigned findVersion(
+    unsigned version, const std::map<unsigned, ValueTy> &map) {
+  llvm::ArrayRef<unsigned> versions{llvm::omp::getOpenMPVersions()};
+  assert(!versions.empty() && "getOpenMPVersions returned empty list");
+  version = std::clamp(version, versions.front(), versions.back());
+
+  // std::map is sorted with respect to keys, by default in the ascending
+  // order: upper_bound is the first key above `version`, so the entry
+  // before it (when there is one) holds the key being searched for.
+  auto it{map.upper_bound(version)};
+  if (it != map.begin()) {
+    --it;
+  }
+
+  // Only an empty map leaves nothing to return.
+  assert(it != map.end() && "cannot locate entry for version in map");
+  return it != map.end() ? it->first : versions.front();
+}
+
+const OmpProperties &OmpModifierDescriptor::props(unsigned version) const {
+  return props_.at(findVersion(version, props_));
+}
+
+const OmpClauses &OmpModifierDescriptor::clauses(unsigned version) const {
+  return clauses_.at(findVersion(version, clauses_));
+}
+
+// Note: The intent is for these functions to be automatically generated
+// in the future.
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpDependenceType>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"dependence-type",
+      /*props=*/
+      {
+          {45, {OmpProperty::Required, OmpProperty::Ultimate}},
+      },
+      /*clauses=*/
+      {
+          {45, {Clause::OMPC_depend}},
+          {51, {Clause::OMPC_depend, Clause::OMPC_update}},
+          {52, {Clause::OMPC_doacross}},
+      },
+  };
+  return desc;
+}
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpIterator>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"iterator",
+      /*props=*/
+      {
+          {50, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {50, {Clause::OMPC_affinity, Clause::OMPC_depend}},
+          {51,
+              {Clause::OMPC_affinity, Clause::OMPC_depend, Clause::OMPC_from,
+                  Clause::OMPC_map, Clause::OMPC_to}},
+      },
+  };
+  return desc;
+}
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpLinearModifier>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"linear-modifier",
+      /*props=*/
+      {
+          {45, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {45, {Clause::OMPC_linear}},
+      },
+  };
+  return desc;
+}
+
+template <>
+const OmpModifierDescriptor &
+OmpGetDescriptor<parser::OmpReductionIdentifier>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"reduction-identifier",
+      /*props=*/
+      {
+          {45, {OmpProperty::Required, OmpProperty::Ultimate}},
+      },
+      /*clauses=*/
+      {
+          {45, {Clause::OMPC_reduction}},
+          {50,
+              {Clause::OMPC_in_reduction, Clause::OMPC_reduction,
+                  Clause::OMPC_task_reduction}},
+      },
+  };
+  return desc;
+}
+
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpTaskDependenceType>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"task-dependence-type",
+      /*props=*/
+      {
+          {52, {OmpProperty::Required, OmpProperty::Ultimate}},
+      },
+      /*clauses=*/
+      {
+          {52, {Clause::OMPC_depend, Clause::OMPC_update}},
+      },
+  };
+  return desc;
+}
+} // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index a2059a1123b5e..80e238f3476ac 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -518,8 +518,8 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   }
 
   bool Pre(const parser::OmpClause::Reduction &x) {
-    const parser::OmpReductionOperator &opr{
-        std::get<parser::OmpReductionOperator>(x.v.t)};
+    const parser::OmpReductionIdentifier &opr{
+        std::get<parser::OmpReductionIdentifier>(x.v.t)};
     auto createDummyProcSymbol = [&](const parser::Name *name) {
       // If name resolution failed, create a dummy symbol
       const auto namePair{
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 09120e3ed0e97..929d35a4717dc 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1471,6 +1471,8 @@ class OmpVisitor : public virtual DeclarationVisitor {
 
   bool Pre(const parser::OpenMPDeclareMapperConstruct &);
 
+  bool Pre(const parser::OmpMapClause &);
+
   void Post(const parser::OmpBeginLoopDirective &) {
     messageHandler().set_currStmtSource(std::nullopt);
   }
@@ -1639,6 +1641,33 @@ bool OmpVisitor::Pre(const parser::OpenMPDeclareMapperConstruct &x) {
   return false;
 }
 
+bool OmpVisitor::Pre(const parser::OmpMapClause &x) {
+  const auto &mid{std::get<parser::OmpMapperIdentifier>(x.t)};
+  if (const auto &mapperName{mid.v}) {
+    if (const auto symbol = FindSymbol(currScope(), *mapperName)) {
+      // TODO: Do we need a specific flag or type here, to distinguish against
+      // other ConstructName things? Leaving this for the full implementation
+      // of mapper lowering.
+      auto *misc{symbol->detailsIf<MiscDetails>()};
+      if (!misc || misc->kind() != MiscDetails::Kind::ConstructName)
+        context().Say(mapperName->source,
+            "Name '%s' should be a mapper name"_err_en_US, mapperName->source);
+      else
+        mapperName->symbol = symbol;
+    } else {
+      mapperName->symbol = &MakeSymbol(
+          *mapperName, MiscDetails{MiscDetails::Kind::ConstructName});
+      // TODO: When completing the implementation, we probably want to error if
+      // the symbol is not declared, but right now, testing that the TODO for
+      // OmpMapClause happens is obscured by the TODO for declare mapper, so
+      // leaving this out. Remove the above line once the declare mapper is
+      // implemented, and instead emit: context().Say(mapperName->source,
+      //     "'%s' not declared"_err_en_US, mapperName->source);
+    }
+  }
+  return true;
+}
+
 // Walk the parse tree and resolve names to symbols.
 class ResolveNamesVisitor : public virtual ScopeHandler,
                             public ModuleVisitor,
diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90
index 6d5c443ab75cb..e850939374568 100644
--- a/flang/test/Driver/dynamic-linker.f90
+++ b/flang/test/Driver/dynamic-linker.f90
@@ -17,7 +17,7 @@
 ! GNU-LINKER-OPTIONS-SAME: "-static"
 ! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
 
-! RDYNAMIC-LINKER-OPTION: "{{.*}}ld"
+! RDYNAMIC-LINKER-OPTION: "{{.*}}ld{{(\.lld)?(\.exe)?}}"
 ! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic"
 
 ! For MSVC, adding -static does not add any additional linker options.
diff --git a/flang/test/Driver/isysroot.f90 b/flang/test/Driver/isysroot.f90
index 28b435cce08ed..07ffb68653147 100644
--- a/flang/test/Driver/isysroot.f90
+++ b/flang/test/Driver/isysroot.f90
@@ -8,7 +8,7 @@
 ! RUN: %flang -### --target=aarch64-linux-gnu -isysroot /path/to/sysroot \
 ! RUN:        %s 2>&1 | FileCheck %s --check-prefix=CHECK-LINUX
 
-! CHECK-DARWIN: "{{.*}}ld{{(64)?(\.lld)?}}" {{.*}}"-syslibroot" "/path/to/sysroot"
+! CHECK-DARWIN: "{{.*}}ld{{(64)?(\.lld)?(\.exe)?}}" {{.*}}"-syslibroot" "/path/to/sysroot"
 ! Unused on Linux.
 ! CHECK-LINUX: warning: argument unused during compilation: '-isysroot /path/to/sysroot'
 ! CHECK-LINUX-NOT: /path/to/sysroot
diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir
index 49bb5bdf5e6bc..abf2d56695b17 100644
--- a/flang/test/Fir/CUDA/cuda-alloc-free.fir
+++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir
@@ -73,7 +73,7 @@ func.func @_QPtest_type() {
 // CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64
 // CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 
-gpu.module @cuda_device_mod [#nvvm.target] {
+gpu.module @cuda_device_mod {
   gpu.func @_QMalloc() kernel {
     %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QMallocEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
     gpu.return 
diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90
index 99386abc4fafd..901497e2cde55 100644
--- a/flang/test/Fir/CUDA/cuda-constructor-2.f90
+++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90
@@ -10,11 +10,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
     fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?xi32>>>
   }
 
-  gpu.module @cuda_device_mod [#nvvm.target] {
+  gpu.module @cuda_device_mod {
   }
 }
 
-// CHECK: gpu.module @cuda_device_mod [#nvvm.target] 
+// CHECK: gpu.module @cuda_device_mod
 
 // CHECK: llvm.func internal @__cudaFortranConstructor() {
 // CHECK-DAG: %[[MODULE:.*]] = cuf.register_module @cuda_device_mod -> !llvm.ptr
diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index 1ee44f3c6d97c..0f9ca6e640a80 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -466,4 +466,91 @@ func.func @_QPlogical_cst() {
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DESC]] : (!fir.ref<!fir.box<!fir.logical<4>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFDataTransferCstDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
 
+func.func @_QPcallkernel(%arg0: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bindc_name = "a"}, %arg1: !fir.ref<f32> {fir.bindc_name = "b"}, %arg2: !fir.ref<f32> {fir.bindc_name = "c"}) {
+  %c0_i64 = arith.constant 0 : i64
+  %c1_i32 = arith.constant 1 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFcallkernelEa"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+  %2 = fir.rebox %1 : (!fir.box<!fir.array<?x?xcomplex<f32>>>) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+  %3 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QFcallkernelEadev"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>
+  %7 = fir.declare %3 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFcallkernelEadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>
+  %8 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFcallkernelEb"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %9 = fir.declare %arg2 dummy_scope %0 {uniq_name = "_QFcallkernelEc"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %10 = fir.alloca i32 {bindc_name = "m", uniq_name = "_QFcallkernelEm"}
+  %11 = fir.declare %10 {uniq_name = "_QFcallkernelEm"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %12 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFcallkernelEn"}
+  %13 = fir.declare %12 {uniq_name = "_QFcallkernelEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %14:3 = fir.box_dims %2, %c0 : (!fir.box<!fir.array<?x?xcomplex<f32>>>, index) -> (index, index, index)
+  %15 = fir.convert %14#1 : (index) -> i32
+  fir.store %15 to %13 : !fir.ref<i32>
+  %16:3 = fir.box_dims %2, %c1 : (!fir.box<!fir.array<?x?xcomplex<f32>>>, index) -> (index, index, index)
+  %27 = fir.load %13 : !fir.ref<i32>
+  %28 = fir.convert %27 : (i32) -> index
+  %29 = arith.cmpi sgt, %28, %c0 : index
+  %30 = arith.select %29, %28, %c0 : index
+  %31 = fir.load %11 : !fir.ref<i32>
+  %32 = fir.convert %31 : (i32) -> index
+  %33 = arith.cmpi sgt, %32, %c0 : index
+  %34 = arith.select %33, %32, %c0 : index
+  %35 = fir.shape %30, %34 : (index, index) -> !fir.shape<2>
+  %36 = fir.undefined index
+  %37 = fir.slice %c1, %28, %c1, %c1, %32, %c1 : (index, index, index, index, index, index) -> !fir.slice<2>
+  %38 = fir.rebox %2 [%37] : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+  cuf.data_transfer %38 to %7 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>
+  return
+}
+
+// CHECK-LABEL: func.func @_QPcallkernel(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bindc_name = "a"}
+// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?x?xcomplex<f32>>>
+// CHECK: %[[DECL_ARG0:.*]] = fir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFcallkernelEa"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+// CHECK: %[[REBOX0:.*]] = fir.rebox %[[DECL_ARG0]] : (!fir.box<!fir.array<?x?xcomplex<f32>>>) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+// CHECK: %[[REBOX1:.*]] = fir.rebox %[[REBOX0]] [%{{.*}}] : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x?xcomplex<f32>>>
+// CHECK: fir.store %[[REBOX1]] to %[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?x?xcomplex<f32>>>>
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref<!fir.box<!fir.array<?x?xcomplex<f32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+
+func.func @_QPsrc_cst() {
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "d4", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Ed4"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+  %5:2 = hlfir.declare %1 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ed4"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %6 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsub4Ei"}
+  %7:2 = hlfir.declare %6 {uniq_name = "_QFsub4Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %c1 = arith.constant 1 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %9 = fir.convert %5#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+  %c6_i32 = arith.constant 6 : i32
+  %14 = fir.convert %c6_i32 : (i32) -> index
+  %c10_i32_0 = arith.constant 10 : i32
+  %15 = fir.convert %c10_i32_0 : (i32) -> index
+  %c1_1 = arith.constant 1 : index
+  %16 = fir.convert %14 : (index) -> i32
+  %17:2 = fir.do_loop %arg1 = %14 to %15 step %c1_1 iter_args(%arg2 = %16) -> (index, i32) {
+    fir.store %arg2 to %7#1 : !fir.ref<i32>
+    %cst = arith.constant -4.000000e+00 : f32
+    %22 = fir.load %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %23 = fir.load %7#0 : !fir.ref<i32>
+    %24 = fir.convert %23 : (i32) -> i64
+    %25 = hlfir.designate %22 (%24)  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, i64) -> !fir.ref<f32>
+    cuf.data_transfer %cst to %25 {transfer_kind = #cuf.cuda_transfer<host_device>} : f32, !fir.ref<f32>
+    %26 = arith.addi %arg1, %c1_1 : index
+    %27 = fir.convert %c1_1 : (index) -> i32
+    %28 = fir.load %7#1 : !fir.ref<i32>
+    %29 = arith.addi %28, %27 : i32
+    fir.result %26, %29 : index, i32
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @_QPsrc_cst()
+// CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+// CHECK: %[[CST:.*]] = arith.constant -4.000000e+00 : f32
+// CHECK: fir.store %[[CST]] to %[[ALLOCA]] : !fir.ref<f32>
+// CHECK: %[[CONV:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref<f32>) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %[[CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
+
 } // end of module
diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90
index c83a938d5af21..8cac643b27c34 100644
--- a/flang/test/Fir/CUDA/cuda-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-device-global.f90
@@ -5,9 +5,9 @@
 module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
   fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<5xi32>
 
-  gpu.module @cuda_device_mod [#nvvm.target] {
+  gpu.module @cuda_device_mod {
   }
 }
 
-// CHECK: gpu.module @cuda_device_mod [#nvvm.target] 
+// CHECK: gpu.module @cuda_device_mod
 // CHECK-NEXT: fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<5xi32>
diff --git a/flang/test/Fir/CUDA/cuda-extranal-mangling.mlir b/flang/test/Fir/CUDA/cuda-extranal-mangling.mlir
new file mode 100644
index 0000000000000..551a89a7018c2
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-extranal-mangling.mlir
@@ -0,0 +1,13 @@
+// RUN: fir-opt --split-input-file --external-name-interop %s | FileCheck %s
+
+gpu.module @cuda_device_mod {
+  gpu.func @_QPfoo() {
+    fir.call @_QPthreadfence() fastmath<contract> : () -> ()
+    gpu.return
+  }
+  func.func private @_QPthreadfence() attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
+}
+
+// CHECK-LABEL: gpu.func @_QPfoo
+// CHECK: fir.call @threadfence_()
+// CHECK: func.func private @threadfence_()
diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
index 18b56a491cd65..772e2696171a6 100644
--- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
+++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90
@@ -25,7 +25,7 @@ // Test that global used in device function are flagged with the correct
 // CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
 // CHECK: fir.global linkonce @_QQcl[[SYMBOL]] {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,32>
 
-// CHECK-LABEL: gpu.module @cuda_device_mod [#nvvm.target]
+// CHECK-LABEL: gpu.module @cuda_device_mod
 // CHECK: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a
 
 // -----
@@ -51,5 +51,96 @@ // Test that global used in device function are flagged with the correct
 // CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
 // CHECK: fir.global linkonce @_QQcl[[SYMBOL]] constant : !fir.char<1,32>
 
-// CHECK-LABEL: gpu.module @cuda_device_mod [#nvvm.target]
+// CHECK-LABEL: gpu.module @cuda_device_mod
 // CHECK-NOT: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a
+
+// -----
+
+func.func @_QPsub1() {
+  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsub1Ei"}
+  %1:2 = hlfir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %c1_i32 = arith.constant 1 : i32
+  %2 = fir.convert %c1_i32 : (i32) -> index
+  %c100_i32 = arith.constant 100 : i32
+  %3 = fir.convert %c100_i32 : (i32) -> index
+  %c1 = arith.constant 1 : index
+  cuf.kernel<<<*, *>>> (%arg0 : index) = (%2 : index) to (%3 : index)  step (%c1 : index) {
+    %4 = fir.convert %arg0 : (index) -> i32
+    fir.store %4 to %1#1 : !fir.ref<i32>
+    %5 = fir.load %1#0 : !fir.ref<i32>
+    %c1_i32_0 = arith.constant 1 : i32
+    %6 = arith.cmpi eq, %5, %c1_i32_0 : i32
+    fir.if %6 {
+      %c6_i32 = arith.constant 6 : i32
+      %7 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref<!fir.char<1,10>>
+      %8 = fir.convert %7 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<i8>
+      %c5_i32 = arith.constant 5 : i32
+      %9 = fir.call @_FortranAioBeginExternalListOutput(%c6_i32, %8, %c5_i32) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+      %10 = fir.address_of(@_QQclX5465737420504153534544) : !fir.ref<!fir.char<1,11>>
+      %c11 = arith.constant 11 : index
+      %11:2 = hlfir.declare %10 typeparams %c11 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX5465737420504153534544"} : (!fir.ref<!fir.char<1,11>>, index) -> (!fir.ref<!fir.char<1,11>>, !fir.ref<!fir.char<1,11>>)
+      %12 = fir.convert %11#1 : (!fir.ref<!fir.char<1,11>>) -> !fir.ref<i8>
+      %13 = fir.convert %c11 : (index) -> i64
+      %14 = fir.call @_FortranAioOutputAscii(%9, %12, %13) fastmath<contract> : (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1
+      %15 = fir.call @_FortranAioEndIoStatement(%9) fastmath<contract> : (!fir.ref<i8>) -> i32
+    }
+    "fir.end"() : () -> ()
+  }
+  return
+}
+func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) -> !fir.ref<i8> attributes {fir.io, fir.runtime}
+fir.global linkonce @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 constant : !fir.char<1,10> {
+  %0 = fir.string_lit "dummy.cuf\00"(10) : !fir.char<1,10>
+  fir.has_value %0 : !fir.char<1,10>
+}
+func.func private @_FortranAioOutputAscii(!fir.ref<i8>, !fir.ref<i8>, i64) -> i1 attributes {fir.io, fir.runtime}
+fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11> {
+  %0 = fir.string_lit "Test PASSED"(11) : !fir.char<1,11>
+  fir.has_value %0 : !fir.char<1,11>
+}
+
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
+
+// CHECK-LABEL: gpu.module @cuda_device_mod
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
+
+// -----
+
+func.func @_QPsub1() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+  %6 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsub1Ei"}
+  %7:2 = hlfir.declare %6 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %12 = fir.load %7#0 : !fir.ref<i32>
+  %c1_i32 = arith.constant 1 : i32
+  %13 = arith.cmpi eq, %12, %c1_i32 : i32
+  fir.if %13 {
+    %c6_i32 = arith.constant 6 : i32
+    %14 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref<!fir.char<1,10>>
+    %15 = fir.convert %14 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<i8>
+    %c3_i32 = arith.constant 3 : i32
+    %16 = fir.call @_FortranAioBeginExternalListOutput(%c6_i32, %15, %c3_i32) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+    %17 = fir.address_of(@_QQclX5465737420504153534544) : !fir.ref<!fir.char<1,11>>
+    %c11 = arith.constant 11 : index
+    %18:2 = hlfir.declare %17 typeparams %c11 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX5465737420504153534544"} : (!fir.ref<!fir.char<1,11>>, index) -> (!fir.ref<!fir.char<1,11>>, !fir.ref<!fir.char<1,11>>)
+    %19 = fir.convert %18#1 : (!fir.ref<!fir.char<1,11>>) -> !fir.ref<i8>
+    %20 = fir.convert %c11 : (index) -> i64
+    %21 = fir.call @_FortranAioOutputAscii(%16, %19, %20) fastmath<contract> : (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1
+    %22 = fir.call @_FortranAioEndIoStatement(%16) fastmath<contract> : (!fir.ref<i8>) -> i32
+  }
+  return
+}
+func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) -> !fir.ref<i8> attributes {fir.io, fir.runtime}
+fir.global linkonce @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 constant : !fir.char<1,10> {
+  %0 = fir.string_lit "dummy.cuf\00"(10) : !fir.char<1,10>
+  fir.has_value %0 : !fir.char<1,10>
+}
+func.func private @_FortranAioOutputAscii(!fir.ref<i8>, !fir.ref<i8>, i64) -> i1 attributes {fir.io, fir.runtime}
+fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11> {
+  %0 = fir.string_lit "Test PASSED"(11) : !fir.char<1,11>
+  fir.has_value %0 : !fir.char<1,11>
+}
+func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}
+
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
+
+// CHECK-LABEL: gpu.module @cuda_device_mod 
+// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index bca454c13ff9c..4b18acb7c2b43 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -47,6 +47,7 @@ func.func @_QQmain() {
 // PASSES-NEXT:   LowerHLFIRIntrinsics
 // PASSES-NEXT:   BufferizeHLFIR
 // PASSES-NEXT:   ConvertHLFIRtoFIR
+// PASSES-NEXT:   LowerWorkshare
 // PASSES-NEXT:   CSE
 // PASSES-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
diff --git a/flang/test/Fir/target-rewrite-integer-loongarch64.fir b/flang/test/Fir/target-rewrite-integer-loongarch64.fir
new file mode 100644
index 0000000000000..8421cbbb41a9d
--- /dev/null
+++ b/flang/test/Fir/target-rewrite-integer-loongarch64.fir
@@ -0,0 +1,27 @@
+/// Test i32 passing and returning on LoongArch64
+/// LoongArch64 LP64D ABI requires unsigned 32 bit integers to be sign extended.
+
+// REQUIRES: loongarch-registered-target
+// RUN: fir-opt --target-rewrite="target=loongarch64-unknown-linux-gnu" %s | FileCheck %s --check-prefix=LOONGARCH64
+// RUN: tco -target="loongarch64-unknown-linux-gnu" %s | FileCheck %s --check-prefix=LOONGARCH64_LLVM
+
+// LOONGARCH64: func.func private @cfunc32(i32 {llvm.signext}) -> (i32 {llvm.signext}) attributes {fir.bindc_name = "cfunc32"}
+
+// LOONGARCH64_LLVM: declare signext i32 @cfunc32(i32 signext)
+func.func private @cfunc32(i32) -> i32 attributes {fir.bindc_name = "cfunc32"}
+
+// LOONGARCH64-LABEL: func.func @foo(
+// LOONGARCH64-SAME: %[[VAL_0:.*]]: i32 {llvm.signext}) -> (i32 {llvm.signext}) attributes {fir.bindc_name = "foo"} {
+// LOONGARCH64: %[[VAL_1:.*]] = fir.call @cfunc32(%[[VAL_0]]) fastmath<contract> : (i32) -> i32
+// LOONGARCH64: return %[[VAL_1]] : i32
+// LOONGARCH64: }
+
+// LOONGARCH64_LLVM-LABEL: define signext i32 @foo(
+// LOONGARCH64_LLVM: i32 signext %[[VAL_0:.*]]) {
+// LOONGARCH64_LLVM: %[[VAL_1:.*]] = call i32 @cfunc32(i32 %[[VAL_0]])
+// LOONGARCH64_LLVM: ret i32 %[[VAL_1]]
+// LOONGARCH64_LLVM: }
+func.func @foo(%0: i32) -> i32 attributes {fir.bindc_name = "foo"} {
+  %1 = fir.call @cfunc32(%0) fastmath<contract> : (i32) -> i32
+  return %1 : i32
+}
diff --git a/flang/test/Lower/OpenMP/Todo/map-mapper.f90 b/flang/test/Lower/OpenMP/Todo/map-mapper.f90
new file mode 100644
index 0000000000000..d83c20db29307
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/map-mapper.f90
@@ -0,0 +1,16 @@
+! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
+program p
+  integer, parameter :: n = 256
+  real(8) :: a(256)
+  !! TODO: Add declare mapper, when it works to lower this construct 
+  !!type t1
+  !!   integer :: x
+  !!end type t1
+  !!!$omp declare mapper(xx : t1 :: nn) map(nn, nn%x)
+  !$omp target map(mapper(xx), from:a)
+!CHECK: not yet implemented: OmpMapClause(MAPPER(...))
+  do i=1,n
+     a(i) = 4.2
+  end do
+  !$omp end target
+end program p
diff --git a/flang/test/Lower/OpenMP/Todo/scope-allocate.f90 b/flang/test/Lower/OpenMP/Todo/scope-allocate.f90
new file mode 100644
index 0000000000000..5a834c81a852c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/scope-allocate.f90
@@ -0,0 +1,12 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=52 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Scope construct
+program omp_scope
+  integer i
+  i = 10
+
+  !$omp scope allocate(x) private(x)
+  print *, "omp scope", i
+  !$omp end scope
+
+end program omp_scope
diff --git a/flang/test/Lower/OpenMP/Todo/scope-firstprivate.f90 b/flang/test/Lower/OpenMP/Todo/scope-firstprivate.f90
new file mode 100644
index 0000000000000..87bcecb817da4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/scope-firstprivate.f90
@@ -0,0 +1,12 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=52 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Scope construct
+program omp_scope
+  integer i
+  i = 10
+
+  !$omp scope firstprivate(x)
+  print *, "omp scope", i
+  !$omp end scope
+
+end program omp_scope
diff --git a/flang/test/Lower/OpenMP/workshare.f90 b/flang/test/Lower/OpenMP/workshare.f90
index 1e11677a15e1f..8e771952f5b6d 100644
--- a/flang/test/Lower/OpenMP/workshare.f90
+++ b/flang/test/Lower/OpenMP/workshare.f90
@@ -6,7 +6,7 @@ subroutine sb1(arr)
   integer :: arr(:)
 !CHECK: omp.parallel  {
   !$omp parallel
-!CHECK: omp.single  {
+!CHECK: omp.workshare {
   !$omp workshare
     arr = 0
   !$omp end workshare
@@ -20,7 +20,7 @@ subroutine sb2(arr)
   integer :: arr(:)
 !CHECK: omp.parallel  {
   !$omp parallel
-!CHECK: omp.single nowait {
+!CHECK: omp.workshare nowait {
   !$omp workshare
     arr = 0
   !$omp end workshare nowait
@@ -33,7 +33,7 @@ subroutine sb2(arr)
 subroutine sb3(arr)
   integer :: arr(:)
 !CHECK: omp.parallel  {
-!CHECK: omp.single  {
+!CHECK: omp.workshare  {
   !$omp parallel workshare
     arr = 0
   !$omp end parallel workshare
diff --git a/flang/test/Parser/OpenMP/affinity-clause.f90 b/flang/test/Parser/OpenMP/affinity-clause.f90
index 804723cad7b2b..5e9e0a2194bab 100644
--- a/flang/test/Parser/OpenMP/affinity-clause.f90
+++ b/flang/test/Parser/OpenMP/affinity-clause.f90
@@ -63,7 +63,7 @@ subroutine f02(x)
 !PARSE-TREE: OmpBeginBlockDirective
 !PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = task
 !PARSE-TREE: | OmpClauseList -> OmpClause -> Affinity -> OmpAffinityClause
-!PARSE-TREE: | | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | | TypeDeclarationStmt
 !PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | | EntityDecl
diff --git a/flang/test/Parser/OpenMP/depobj-construct.f90 b/flang/test/Parser/OpenMP/depobj-construct.f90
index 3de190c95bb73..51726a5adf99e 100644
--- a/flang/test/Parser/OpenMP/depobj-construct.f90
+++ b/flang/test/Parser/OpenMP/depobj-construct.f90
@@ -15,7 +15,7 @@ subroutine f00
 !PARSE-TREE: | Verbatim
 !PARSE-TREE: | OmpObject -> Designator -> DataRef -> Name = 'x'
 !PARSE-TREE: | OmpClause -> Depend -> OmpDependClause -> TaskDep
-!PARSE-TREE: | | OmpTaskDependenceType -> Type = In
+!PARSE-TREE: | | OmpTaskDependenceType -> Value = In
 !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'y'
 
 subroutine f01
@@ -31,7 +31,7 @@ subroutine f01
 !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPDepobjConstruct
 !PARSE-TREE: | Verbatim
 !PARSE-TREE: | OmpObject -> Designator -> DataRef -> Name = 'x'
-!PARSE-TREE: | OmpClause -> Update -> OmpUpdateClause -> OmpTaskDependenceType -> Type = Out
+!PARSE-TREE: | OmpClause -> Update -> OmpUpdateClause -> OmpTaskDependenceType -> Value = Out
 
 subroutine f02
   integer :: x
diff --git a/flang/test/Parser/OpenMP/from-clause.f90 b/flang/test/Parser/OpenMP/from-clause.f90
index 1dcca0b611dfb..cff9c077c0a94 100644
--- a/flang/test/Parser/OpenMP/from-clause.f90
+++ b/flang/test/Parser/OpenMP/from-clause.f90
@@ -45,7 +45,7 @@ subroutine f02(x)
 !PARSE-TREE: OmpSimpleStandaloneDirective -> llvm::omp::Directive = target update
 !PARSE-TREE: OmpClauseList -> OmpClause -> From -> OmpFromClause
 !PARSE-TREE: | Expectation = Present
-!PARSE-TREE: | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | TypeDeclarationStmt
 !PARSE-TREE: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | EntityDecl
@@ -74,7 +74,7 @@ subroutine f03(x)
 !PARSE-TREE: OmpSimpleStandaloneDirective -> llvm::omp::Directive = target update
 !PARSE-TREE: OmpClauseList -> OmpClause -> From -> OmpFromClause
 !PARSE-TREE: | Expectation = Present
-!PARSE-TREE: | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | TypeDeclarationStmt
 !PARSE-TREE: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | EntityDecl
diff --git a/flang/test/Parser/OpenMP/in-reduction-clause.f90 b/flang/test/Parser/OpenMP/in-reduction-clause.f90
index 776ead3824b71..ab26ca2d9300f 100644
--- a/flang/test/Parser/OpenMP/in-reduction-clause.f90
+++ b/flang/test/Parser/OpenMP/in-reduction-clause.f90
@@ -37,14 +37,14 @@ end subroutine omp_in_reduction_taskgroup
 !PARSE-TREE-NEXT: OmpBeginBlockDirective
 !PARSE-TREE-NEXT: OmpBlockDirective -> llvm::omp::Directive = task
 !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> InReduction -> OmpInReductionClause
-!PARSE-TREE-NEXT: OmpReductionOperator -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z'
 
 !PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct
 !PARSE-TREE-NEXT: OmpBeginLoopDirective
 !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop
 !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> InReduction -> OmpInReductionClause
-!PARSE-TREE-NEXT: OmpReductionOperator -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
 !PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z'
 
 subroutine omp_in_reduction_parallel()
@@ -74,6 +74,6 @@ end subroutine omp_in_reduction_parallel
 !PARSE-TREE-NEXT: OmpBeginLoopDirective
 !PARSE-TREE-NEXT: OmpLoopDirective -> llvm::omp::Directive = taskloop simd
 !PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> InReduction -> OmpInReductionClause
-!PARSE-TREE-NEXT: OmpReductionOperator -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
-!PASRE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z'
+!PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'z'
 
diff --git a/flang/test/Parser/OpenMP/map-modifiers.f90 b/flang/test/Parser/OpenMP/map-modifiers.f90
index 0c95f21c5e6a5..578512283c4dc 100644
--- a/flang/test/Parser/OpenMP/map-modifiers.f90
+++ b/flang/test/Parser/OpenMP/map-modifiers.f90
@@ -159,7 +159,7 @@ subroutine f10(x)
 !PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target
 !PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause
 !PARSE-TREE: | | TypeModifier = Present
-!PARSE-TREE: | | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | | TypeDeclarationStmt
 !PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | | EntityDecl
@@ -194,7 +194,7 @@ subroutine f11(x)
 !PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target
 !PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause
 !PARSE-TREE: | | TypeModifier = Present
-!PARSE-TREE: | | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | | TypeDeclarationStmt
 !PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | | EntityDecl
@@ -229,7 +229,7 @@ subroutine f12(x)
 !PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target
 !PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause
 !PARSE-TREE: | | TypeModifier = Present
-!PARSE-TREE: | | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | | TypeDeclarationStmt
 !PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | | EntityDecl
@@ -287,7 +287,7 @@ subroutine f90(x, y)
 !PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target
 !PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause
 !PARSE-TREE: | | TypeModifier = Present
-!PARSE-TREE: | | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | | TypeDeclarationStmt
 !PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | | EntityDecl
@@ -316,3 +316,28 @@ subroutine f90(x, y)
 !PARSE-TREE: | | | | Designator -> DataRef -> Name = 'k'
 !PARSE-TREE: | | bool = 'true'
 
+subroutine f100(x, y)
+  integer :: x(10)
+  integer :: y
+  integer, parameter :: p = 23
+  !$omp target map(mapper(xx), from: x)
+  x = x + 1
+  !$omp end target
+end
+
+!UNPARSE: SUBROUTINE f100 (x, y)
+!UNPARSE:  INTEGER x(10_4)
+!UNPARSE:  INTEGER y
+!UNPARSE:  INTEGER, PARAMETER :: p = 23_4
+!UNPARSE: !$OMP TARGET  MAP(MAPPER(XX), FROM: X)
+!UNPARSE:   x=x+1_4
+!UNPARSE: !$OMP END TARGET
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: OmpBeginBlockDirective
+!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target
+!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause
+!PARSE-TREE: | | OmpMapperIdentifier -> Name = 'xx'
+!PARSE-TREE: | | Type = From
+!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x'
+
diff --git a/flang/test/Parser/OpenMP/reduction-modifier.f90 b/flang/test/Parser/OpenMP/reduction-modifier.f90
index d46aa70959592..4bba23bcf0611 100644
--- a/flang/test/Parser/OpenMP/reduction-modifier.f90
+++ b/flang/test/Parser/OpenMP/reduction-modifier.f90
@@ -10,7 +10,7 @@ subroutine foo()
 ! PARSE-TREE: | | | | OmpLoopDirective -> llvm::omp::Directive = do
 ! PARSE-TREE: | | | | OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause
 ! PARSE-TREE: | | | | | ReductionModifier = Task
-! PARSE-TREE: | | | | | OmpReductionOperator -> DefinedOperator -> IntrinsicOperator = Multiply
+! PARSE-TREE: | | | | | OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply
 ! PARSE-TREE: | | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j
   !$omp do reduction (task, *: j)
   do i = 1, 10
diff --git a/flang/test/Parser/OpenMP/target-update-to-clause.f90 b/flang/test/Parser/OpenMP/target-update-to-clause.f90
index 2702575847924..bb57270fc0bf9 100644
--- a/flang/test/Parser/OpenMP/target-update-to-clause.f90
+++ b/flang/test/Parser/OpenMP/target-update-to-clause.f90
@@ -45,7 +45,7 @@ subroutine f02(x)
 !PARSE-TREE: OmpSimpleStandaloneDirective -> llvm::omp::Directive = target update
 !PARSE-TREE: OmpClauseList -> OmpClause -> To -> OmpToClause
 !PARSE-TREE: | Expectation = Present
-!PARSE-TREE: | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | TypeDeclarationStmt
 !PARSE-TREE: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | EntityDecl
@@ -74,7 +74,7 @@ subroutine f03(x)
 !PARSE-TREE: OmpSimpleStandaloneDirective -> llvm::omp::Directive = target update
 !PARSE-TREE: OmpClauseList -> OmpClause -> To -> OmpToClause
 !PARSE-TREE: | Expectation = Present
-!PARSE-TREE: | OmpIteratorModifier -> OmpIteratorSpecifier
+!PARSE-TREE: | OmpIterator -> OmpIteratorSpecifier
 !PARSE-TREE: | | TypeDeclarationStmt
 !PARSE-TREE: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
 !PARSE-TREE: | | | EntityDecl
diff --git a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 b/flang/test/Semantics/OpenMP/map-clause-symbols.f90
new file mode 100644
index 0000000000000..8f984fcd2fa7e
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/map-clause-symbols.f90
@@ -0,0 +1,14 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+program main
+!CHECK-LABEL:  MainProgram scope: main
+  integer, parameter :: n = 256
+  real(8) :: a(256)
+  !$omp target map(mapper(xx), from:a)
+  do i=1,n
+     a(i) = 4.2
+  end do
+  !$omp end target
+!CHECK:    OtherConstruct scope: size=0 alignment=1 sourceRange=74 bytes
+!CHECK:    OtherClause scope: size=0 alignment=1 sourceRange=0 bytes
+!CHECK:    xx: Misc ConstructName
+end program main
diff --git a/flang/test/Semantics/OpenMP/map-clause.f90 b/flang/test/Semantics/OpenMP/map-clause.f90
index a7430c3edeb94..efcef2571a04a 100644
--- a/flang/test/Semantics/OpenMP/map-clause.f90
+++ b/flang/test/Semantics/OpenMP/map-clause.f90
@@ -33,3 +33,11 @@ subroutine sb(arr)
    c = 2
  !$omp end target
 end subroutine
+
+subroutine sb1
+  integer :: xx
+  integer :: a
+  !ERROR: Name 'xx' should be a mapper name
+  !$omp target map(mapper(xx), from:a)
+  !$omp end target
+end subroutine sb1
diff --git a/flang/test/Semantics/io03.f90 b/flang/test/Semantics/io03.f90
index 9cd672ee01696..6c05924f09dce 100644
--- a/flang/test/Semantics/io03.f90
+++ b/flang/test/Semantics/io03.f90
@@ -58,6 +58,13 @@
   read(internal_file2, *) jj
   read(internal_file4, *) jj
 
+  !This is a valid statement but it's not what it looks like; "(internal-file)"
+  !must be parsed as a format expression, not as an internal unit.
+  read(internal_file) jj
+
+  !ERROR: If UNIT=internal-file appears, FMT or NML must also appear
+  read(internal_file, iostat=stat1) jj
+
   !ERROR: Internal file must not have a vector subscript
   read(internal_fileA(vv), *) jj
 
@@ -106,11 +113,12 @@
   !ERROR: If UNIT=* appears, POS must not appear
   read(*, pos=13)
 
+  !ERROR: If UNIT=internal-file appears, FMT or NML must also appear
   !ERROR: If UNIT=internal-file appears, REC must not appear
   read(internal_file, rec=13)
 
   !ERROR: If UNIT=internal-file appears, POS must not appear
-  read(internal_file, pos=13)
+  read(internal_file, *, pos=13)
 
   !ERROR: If REC appears, END must not appear
   read(10, fmt='(I4)', end=9, rec=13) jj
@@ -135,7 +143,7 @@
   read(*, asynchronous='yes')
 
   !ERROR: If ASYNCHRONOUS='YES' appears, UNIT=number must also appear
-  read(internal_file, asynchronous='y'//'es')
+  read(internal_file, *, asynchronous='y'//'es')
 
   !ERROR: If ID appears, ASYNCHRONOUS='YES' must also appear
   read(10, id=id)
diff --git a/flang/test/Semantics/io04.f90 b/flang/test/Semantics/io04.f90
index 685e43dd6e401..7114f14a9488a 100644
--- a/flang/test/Semantics/io04.f90
+++ b/flang/test/Semantics/io04.f90
@@ -34,6 +34,7 @@
   write(unit=10) 'Ok'
   write(*, nnn)
   write(10, nnn)
+  !ERROR: If UNIT=internal-file appears, FMT or NML must also appear
   write(internal_file)
   write(internal_file, *)
   write(internal_file, fmt=*)
@@ -55,7 +56,7 @@
   allocate(a(8), stat=stat8)
 
   !ERROR: Duplicate UNIT specifier
-  write(internal_file, unit=*)
+  write(internal_file, unit=*, fmt=*)
 
   !ERROR: WRITE statement must have a UNIT specifier
   write(nml=nnn)
@@ -84,6 +85,7 @@
   !ERROR: If UNIT=internal-file appears, POS must not appear
   write(internal_file, err=9, pos=n, nml=nnn)
 
+  !ERROR: If UNIT=internal-file appears, FMT or NML must also appear
   !ERROR: If UNIT=internal-file appears, REC must not appear
   write(internal_file, rec=n, err=9)
 
@@ -106,7 +108,7 @@
   write(*, asynchronous='yes')
 
   !ERROR: If ASYNCHRONOUS='YES' appears, UNIT=number must also appear
-  write(internal_file, asynchronous='yes')
+  write(internal_file, *, asynchronous='yes')
 
   !ERROR: If ID appears, ASYNCHRONOUS='YES' must also appear
   write(10, *, id=id) "Ok"
diff --git a/flang/test/Semantics/pointer02.f90 b/flang/test/Semantics/pointer02.f90
new file mode 100644
index 0000000000000..90bb435855939
--- /dev/null
+++ b/flang/test/Semantics/pointer02.f90
@@ -0,0 +1,53 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+recursive subroutine sub(dp, dpp)
+  procedure(inner) dp
+  procedure(inner), pointer :: dpp
+  procedure(inner) ext
+  procedure(sub), pointer :: p1 => sub ! ok
+  procedure(inner), pointer :: p2 => ext ! ok
+  !ERROR: Procedure pointer 'p3' initializer 'inner' is neither an external nor a module procedure
+  procedure(inner), pointer :: p3 => inner
+  !ERROR: Procedure pointer 'p4' initializer 'dp' is neither an external nor a module procedure
+  procedure(inner), pointer :: p4 => dp
+  !ERROR: Procedure pointer 'p5' initializer 'dpp' is neither an external nor a module procedure
+  procedure(inner), pointer :: p5 => dpp
+  generic :: generic => ext
+  !ERROR: 'generic' must be an abstract interface or a procedure with an explicit interface
+  procedure(generic), pointer :: p6 ! => generic
+ contains
+  subroutine inner
+  end
+end
+recursive function fun() result(res)
+  procedure(fun), pointer :: p1 => fun ! ok
+  !ERROR: Procedure pointer 'p2' initializer 'inner' is neither an external nor a module procedure
+  procedure(inner), pointer :: p2 => inner
+  res = 0.
+ contains
+  function inner()
+    inner = 0.
+  end
+end
+module m
+  procedure(msub), pointer :: ps1 => msub ! ok
+  procedure(mfun), pointer :: pf1 => mfun ! ok
+ contains
+  recursive subroutine msub
+    procedure(msub), pointer :: ps2 => msub ! ok
+    !ERROR: Procedure pointer 'ps3' initializer 'inner' is neither an external nor a module procedure
+    procedure(inner), pointer :: ps3 => inner
+   contains
+    subroutine inner
+    end
+  end
+  recursive function mfun() result(res)
+    procedure(mfun), pointer :: pf2 => mfun ! ok
+    !ERROR: Procedure pointer 'pf3' initializer 'inner' is neither an external nor a module procedure
+    procedure(inner), pointer :: pf3 => inner
+    res = 0.
+   contains
+    function inner()
+      inner = 0.
+    end
+  end
+end
diff --git a/flang/test/Semantics/smp-def02.f90 b/flang/test/Semantics/smp-def02.f90
new file mode 100644
index 0000000000000..ef27f14edae0a
--- /dev/null
+++ b/flang/test/Semantics/smp-def02.f90
@@ -0,0 +1,42 @@
+!RUN: %flang -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
+!Ensure no bogus error messages about insufficiently defined procedures
+!CHECK-NOT: error
+
+module m
+  interface
+    module subroutine smp1(a1)
+    end
+  end interface
+end
+
+submodule(m) sm1
+  interface
+    module subroutine smp2(a1,a2)
+    end
+  end interface
+end
+
+submodule(m:sm1) sm2
+  interface generic
+    procedure smp1
+    procedure smp2
+    module subroutine smp3(a1,a2,a3)
+    end
+  end interface
+ contains
+  subroutine local1
+    call generic(0.)
+    call generic(0., 1.)
+    call generic(0., 1., 2.)
+  end
+  subroutine local2(a1,a2,a3)
+  end
+  module procedure smp1
+  end
+  module subroutine smp2(a1,a2)
+  end
+  module subroutine smp3(a1,a2,a3)
+  end
+end
+
+
diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90
index 08e7fe1e44899..e1ae58dae7c0a 100644
--- a/flang/test/Semantics/undef-result01.f90
+++ b/flang/test/Semantics/undef-result01.f90
@@ -117,7 +117,7 @@ function defdByNamelist()
 end
 
 character(4) function defdByWrite()
-  write(defdByWrite) 'abcd'
+  write(defdByWrite,*) 'abcd'
 end
 
 integer function defdBySize()
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
new file mode 100644
index 0000000000000..12b0558d06ed5
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
@@ -0,0 +1,53 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that fir.alloca is hoisted out and copyprivate'd
+func.func @wsfunc() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    %c42 = arith.constant 42 : index
+    %c1_i32 = arith.constant 1 : i32
+    %alloc = fir.alloca i32
+    fir.store %c1_i32 to %alloc : !fir.ref<i32>
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+        "test.test1"(%alloc) : (!fir.ref<i32>) -> ()
+        omp.yield
+      }
+    }
+    "test.test2"(%alloc) : (!fir.ref<i32>) -> ()
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL:   func.func private @_workshare_copy_i32(
+// CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<i32>,
+// CHECK-SAME:                                           %[[VAL_1:.*]]: !fir.ref<i32>) {
+// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK:           fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @wsfunc() {
+// CHECK:           %[[VAL_0:.*]] = fir.alloca i32
+// CHECK:           omp.single copyprivate(%[[VAL_0]] -> @_workshare_copy_i32 : !fir.ref<i32>) {
+// CHECK:             %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:             fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : index
+// CHECK:           omp.wsloop {
+// CHECK:             omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_2]]) {
+// CHECK:               "test.test1"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK:               omp.yield
+// CHECK:             }
+// CHECK:           }
+// CHECK:           omp.single nowait {
+// CHECK:             "test.test2"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           omp.barrier
+// CHECK:           return
+// CHECK:         }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
new file mode 100644
index 0000000000000..f1d0e8e229614
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that the omp.workshare.loop_wrapper binds to the correct omp.workshare
+
+func.func @wsfunc() {
+  %c1 = arith.constant 1 : index
+  %c42 = arith.constant 42 : index
+  omp.parallel {
+    omp.workshare nowait {
+      omp.parallel {
+        omp.workshare nowait {
+          omp.workshare.loop_wrapper {
+            omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+              "test.test2"() : () -> ()
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL:   func.func @wsfunc() {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           omp.parallel {
+// CHECK:             omp.single nowait {
+// CHECK:               omp.parallel {
+// CHECK:                 omp.wsloop nowait {
+// CHECK:                   omp.loop_nest (%[[VAL_2:.*]]) : index = (%[[VAL_0]]) to (%[[VAL_1]]) inclusive step (%[[VAL_0]]) {
+// CHECK:                     "test.test2"() : () -> ()
+// CHECK:                     omp.yield
+// CHECK:                   }
+// CHECK:                 }
+// CHECK:                 omp.terminator
+// CHECK:               }
+// CHECK:               omp.terminator
+// CHECK:             }
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
new file mode 100644
index 0000000000000..ca288917a3ac4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
@@ -0,0 +1,57 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we clean up unused pure operations from the parallel and single
+// regions.
+
+// CHECK-LABEL:   func.func @wsfunc() {
+// CHECK:           %[[VAL_0:.*]] = fir.alloca i32
+// CHECK:           omp.parallel {
+// CHECK:             omp.single {
+// CHECK:               %[[VAL_1:.*]] = "test.test1"() : () -> i32
+// CHECK:               %[[VAL_2:.*]] = arith.constant 2 : index
+// CHECK:               %[[VAL_3:.*]] = arith.constant 3 : index
+// CHECK:               %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : index
+// CHECK:               "test.test3"(%[[VAL_4]]) : (index) -> ()
+// CHECK:               omp.terminator
+// CHECK:             }
+// CHECK:             %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK:             %[[VAL_6:.*]] = arith.constant 42 : index
+// CHECK:             omp.wsloop nowait {
+// CHECK:               omp.loop_nest (%[[VAL_7:.*]]) : index = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_5]]) {
+// CHECK:                 "test.test2"() : () -> ()
+// CHECK:                 omp.yield
+// CHECK:               }
+// CHECK:             }
+// CHECK:             omp.barrier
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+      %t1 = "test.test1"() : () -> i32
+
+      %c1 = arith.constant 1 : index
+      %c42 = arith.constant 42 : index
+
+      %c2 = arith.constant 2 : index
+      %c3 = arith.constant 3 : index
+      %add = arith.addi %c2, %c3 : index
+      "test.test3"(%add) : (index) -> ()
+
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test2"() : () -> ()
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
new file mode 100644
index 0000000000000..d7a04e198ceed
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
@@ -0,0 +1,73 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+
+// Check if we store the correct values
+
+func.func @wsfunc() {
+  omp.parallel {
+  // CHECK: fir.alloca
+  // CHECK: fir.alloca
+  // CHECK: fir.alloca
+  // CHECK: fir.alloca
+  // CHECK: fir.alloca
+  // CHECK-NOT: fir.alloca
+    omp.workshare {
+
+      %t1 = "test.test1"() : () -> i32
+      // CHECK: %[[T1:.*]] = "test.test1"
+      // CHECK: fir.store %[[T1]]
+      %t2 = "test.test2"() : () -> i32
+      // CHECK: %[[T2:.*]] = "test.test2"
+      // CHECK: fir.store %[[T2]]
+      %t3 = "test.test3"() : () -> i32
+      // CHECK: %[[T3:.*]] = "test.test3"
+      // CHECK-NOT: fir.store %[[T3]]
+      %t4 = "test.test4"() : () -> i32
+      // CHECK: %[[T4:.*]] = "test.test4"
+      // CHECK: fir.store %[[T4]]
+      %t5 = "test.test5"() : () -> i32
+      // CHECK: %[[T5:.*]] = "test.test5"
+      // CHECK: fir.store %[[T5]]
+      %t6 = "test.test6"() : () -> i32
+      // CHECK: %[[T6:.*]] = "test.test6"
+      // CHECK-NOT: fir.store %[[T6]]
+
+
+      "test.test1"(%t1) : (i32) -> ()
+      "test.test1"(%t2) : (i32) -> ()
+      "test.test1"(%t3) : (i32) -> ()
+
+      %true = arith.constant true
+      fir.if %true {
+        "test.test2"(%t3) : (i32) -> ()
+      }
+
+      %c1_i32 = arith.constant 1 : i32
+
+      %t5_pure_use = arith.addi %t5, %c1_i32 : i32
+
+      %t6_mem_effect_use = "test.test8"(%t6) : (i32) -> i32
+      // CHECK: %[[T6_USE:.*]] = "test.test8"
+      // CHECK: fir.store %[[T6_USE]]
+
+      %c42 = arith.constant 42 : index
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test10"(%t1) : (i32) -> ()
+          "test.test10"(%t5_pure_use) : (i32) -> ()
+          "test.test10"(%t6_mem_effect_use) : (i32) -> ()
+          omp.yield
+        }
+      }
+
+      "test.test10"(%t2) : (i32) -> ()
+      fir.if %true {
+        "test.test10"(%t4) : (i32) -> ()
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
new file mode 100644
index 0000000000000..31db8213b5f00
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
@@ -0,0 +1,25 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that the safe to parallelize `fir.declare` op will not be parallelized
+// due to its operand %alloc not being reloaded outside the omp.single.
+
+func.func @foo() {
+  %c0 = arith.constant 0 : index
+  omp.workshare {
+    %alloc = fir.allocmem !fir.array<?xf32>, %c0 {bindc_name = ".tmp.forall", uniq_name = ""}
+    %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+    %declare = fir.declare %alloc(%shape) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+    fir.freemem %alloc : !fir.heap<!fir.array<?xf32>>
+    omp.terminator
+  }
+  return
+}
+
+// CHECK:    omp.single nowait
+// CHECK:      fir.allocmem
+// CHECK:      fir.shape
+// CHECK:      fir.declare
+// CHECK:      fir.freemem
+// CHECK:      omp.terminator
+// CHECK:    }
+// CHECK:    omp.barrier
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
new file mode 100644
index 0000000000000..1fd379a6e5eb4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we do not emit an omp.single for the constant operation
+
+func.func @foo() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c1) inclusive step (%c1) {
+        "test.test0"() : () -> ()
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-NOT: omp.single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
new file mode 100644
index 0000000000000..940662e0bdccc
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we correctly handle nowait
+
+// CHECK-LABEL:   func.func @nonowait
+func.func @nonowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK: omp.barrier
+  omp.workshare {
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @nowait
+func.func @nowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK-NOT: omp.barrier
+  omp.workshare nowait {
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
new file mode 100644
index 0000000000000..83c49cd635d08
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
@@ -0,0 +1,26 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check that the definition of %r dominates its use post-transform
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb2:
+      "test.test2"(%r) : (i32) -> ()
+      omp.terminator
+    ^bb3(%arg1: i32):
+      %r = "test.test2"(%arg1) : (i32) -> i32
+      cf.br ^bb2
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
new file mode 100644
index 0000000000000..a27cf88069401
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check transforming a simple CFG
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb3(%arg1: i32):
+      "test.test2"(%arg1) : (i32) -> ()
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index fe5e36f704c76..1c24979bbcdaf 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -452,7 +452,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
     if (emitFIR && useHLFIR) {
       // lower HLFIR to FIR
-      fir::createHLFIRToFIRPassPipeline(pm, llvm::OptimizationLevel::O2);
+      fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP,
+                                        llvm::OptimizationLevel::O2);
       if (mlir::failed(pm.run(mlirModule))) {
         llvm::errs() << "FATAL: lowering from HLFIR to FIR failed";
         return mlir::failure();
@@ -467,6 +468,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
     // Add O2 optimizer pass pipeline.
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    if (enableOpenMP)
+      config.EnableOpenMP = true;
     config.NSWOnLoopVarInc = setNSW;
     fir::registerDefaultInlinerPass(config);
     fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
index 5c373c4e85258..eaf4bae088454 100644
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -139,6 +139,7 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) {
       return mlir::failure();
   } else {
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    config.EnableOpenMP = true;  // assume the input contains OpenMP
     config.AliasAnalysis = true; // enabled when optimizing for speed
     if (codeGenLLVM) {
       // Run only CodeGen passes.
diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
index ee5865d7f6407..e1244154341e9 100644
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -302,6 +302,6 @@ associated with relocatable device code linking.
 Extensions
 ----------
 
-We describe which operation the RPC server should take with a 16-bit opcode. We
-consider the first 32768 numbers to be reserved while the others are free to
-use.
+The opcode is a 32-bit integer that must be unique to the requested operation.
+All opcodes used by ``libc`` internally have the character ``c`` in the most
+significant byte.
diff --git a/libc/fuzzing/__support/hashtable_fuzz.cpp b/libc/fuzzing/__support/hashtable_fuzz.cpp
index 7d61e106c9c4a..8ab5e3b55cfd4 100644
--- a/libc/fuzzing/__support/hashtable_fuzz.cpp
+++ b/libc/fuzzing/__support/hashtable_fuzz.cpp
@@ -10,6 +10,7 @@
 ///
 //===----------------------------------------------------------------------===//
 #include "include/llvm-libc-types/ENTRY.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/HashTable/table.h"
 #include "src/__support/macros/config.h"
@@ -81,15 +82,14 @@ static struct {
 
   template <typename T> T next() {
     static_assert(cpp::is_integral<T>::value, "T must be an integral type");
-    union {
-      T result;
-      char data[sizeof(T)];
-    };
-    for (size_t i = 0; i < sizeof(result); i++)
+
+    char data[sizeof(T)];
+
+    for (size_t i = 0; i < sizeof(T); i++)
       data[i] = buffer[i];
-    buffer += sizeof(result);
-    remaining -= sizeof(result);
-    return result;
+    buffer += sizeof(T);
+    remaining -= sizeof(T);
+    return cpp::bit_cast<T>(data);
   }
 
   cpp::string_view next_string() {
diff --git a/libc/include/llvm-libc-types/rpc_opcodes_t.h b/libc/include/llvm-libc-types/rpc_opcodes_t.h
index 1a6c0cd9bc4a1..f3b35518935a5 100644
--- a/libc/include/llvm-libc-types/rpc_opcodes_t.h
+++ b/libc/include/llvm-libc-types/rpc_opcodes_t.h
@@ -9,38 +9,41 @@
 #ifndef LLVM_LIBC_TYPES_RPC_OPCODES_T_H
 #define LLVM_LIBC_TYPES_RPC_OPCODES_T_H
 
+#define LLVM_LIBC_RPC_BASE 'c'
+#define LLVM_LIBC_OPCODE(n) ((LLVM_LIBC_RPC_BASE << 24) | (n))
+
 typedef enum {
-  RPC_NOOP = 0,
-  RPC_EXIT,
-  RPC_WRITE_TO_STDOUT,
-  RPC_WRITE_TO_STDERR,
-  RPC_WRITE_TO_STREAM,
-  RPC_WRITE_TO_STDOUT_NEWLINE,
-  RPC_READ_FROM_STREAM,
-  RPC_READ_FGETS,
-  RPC_OPEN_FILE,
-  RPC_CLOSE_FILE,
-  RPC_MALLOC,
-  RPC_FREE,
-  RPC_HOST_CALL,
-  RPC_ABORT,
-  RPC_FEOF,
-  RPC_FERROR,
-  RPC_CLEARERR,
-  RPC_FSEEK,
-  RPC_FTELL,
-  RPC_FFLUSH,
-  RPC_UNGETC,
-  RPC_PRINTF_TO_STDOUT,
-  RPC_PRINTF_TO_STDERR,
-  RPC_PRINTF_TO_STREAM,
-  RPC_PRINTF_TO_STDOUT_PACKED,
-  RPC_PRINTF_TO_STDERR_PACKED,
-  RPC_PRINTF_TO_STREAM_PACKED,
-  RPC_REMOVE,
-  RPC_RENAME,
-  RPC_SYSTEM,
-  RPC_LAST = 0xFFFF,
+  RPC_NOOP = LLVM_LIBC_OPCODE(0),
+  RPC_EXIT = LLVM_LIBC_OPCODE(1),
+  RPC_WRITE_TO_STDOUT = LLVM_LIBC_OPCODE(2),
+  RPC_WRITE_TO_STDERR = LLVM_LIBC_OPCODE(3),
+  RPC_WRITE_TO_STREAM = LLVM_LIBC_OPCODE(4),
+  RPC_WRITE_TO_STDOUT_NEWLINE = LLVM_LIBC_OPCODE(5),
+  RPC_READ_FROM_STREAM = LLVM_LIBC_OPCODE(6),
+  RPC_READ_FGETS = LLVM_LIBC_OPCODE(7),
+  RPC_OPEN_FILE = LLVM_LIBC_OPCODE(8),
+  RPC_CLOSE_FILE = LLVM_LIBC_OPCODE(9),
+  RPC_MALLOC = LLVM_LIBC_OPCODE(10),
+  RPC_FREE = LLVM_LIBC_OPCODE(11),
+  RPC_HOST_CALL = LLVM_LIBC_OPCODE(12),
+  RPC_ABORT = LLVM_LIBC_OPCODE(13),
+  RPC_FEOF = LLVM_LIBC_OPCODE(14),
+  RPC_FERROR = LLVM_LIBC_OPCODE(15),
+  RPC_CLEARERR = LLVM_LIBC_OPCODE(16),
+  RPC_FSEEK = LLVM_LIBC_OPCODE(17),
+  RPC_FTELL = LLVM_LIBC_OPCODE(18),
+  RPC_FFLUSH = LLVM_LIBC_OPCODE(19),
+  RPC_UNGETC = LLVM_LIBC_OPCODE(20),
+  RPC_PRINTF_TO_STDOUT = LLVM_LIBC_OPCODE(21),
+  RPC_PRINTF_TO_STDERR = LLVM_LIBC_OPCODE(22),
+  RPC_PRINTF_TO_STREAM = LLVM_LIBC_OPCODE(23),
+  RPC_PRINTF_TO_STDOUT_PACKED = LLVM_LIBC_OPCODE(24),
+  RPC_PRINTF_TO_STDERR_PACKED = LLVM_LIBC_OPCODE(25),
+  RPC_PRINTF_TO_STREAM_PACKED = LLVM_LIBC_OPCODE(26),
+  RPC_REMOVE = LLVM_LIBC_OPCODE(27),
+  RPC_RENAME = LLVM_LIBC_OPCODE(28),
+  RPC_SYSTEM = LLVM_LIBC_OPCODE(29),
+  RPC_LAST = 0xFFFFFFFF,
 } rpc_opcode_t;
 
 #endif // LLVM_LIBC_TYPES_RPC_OPCODES_T_H
diff --git a/libc/src/__support/HashTable/generic/bitmask_impl.inc b/libc/src/__support/HashTable/generic/bitmask_impl.inc
index 469ddeeed8a85..d526dc1ece293 100644
--- a/libc/src/__support/HashTable/generic/bitmask_impl.inc
+++ b/libc/src/__support/HashTable/generic/bitmask_impl.inc
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/endian_internal.h"
 #include "src/__support/macros/config.h"
@@ -44,13 +45,11 @@ struct Group {
 
   // Load a group of control words from an arbitary address.
   LIBC_INLINE static Group load(const void *addr) {
-    union {
-      bitmask_t value;
-      char bytes[sizeof(bitmask_t)];
-    } data;
+    char bytes[sizeof(bitmask_t)];
+
     for (size_t i = 0; i < sizeof(bitmask_t); ++i)
-      data.bytes[i] = static_cast<const char *>(addr)[i];
-    return {data.value};
+      bytes[i] = static_cast<const char *>(addr)[i];
+    return Group{cpp::bit_cast<bitmask_t>(bytes)};
   }
 
   // Load a group of control words from an aligned address.
diff --git a/libc/src/__support/OSUtil/gpu/exit.cpp b/libc/src/__support/OSUtil/gpu/exit.cpp
index 8aaa41b4e3eef..0cb266a42d180 100644
--- a/libc/src/__support/OSUtil/gpu/exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/exit.cpp
@@ -8,6 +8,7 @@
 
 #include "src/__support/OSUtil/exit.h"
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index a257003a907de..30dd2c1a8125d 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -19,8 +19,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
 
 #include "rpc_util.h"
-#include "src/__support/CPP/optional.h"
-#include "src/__support/GPU/utils.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 
 #include <stdint.h>
@@ -38,6 +37,9 @@ namespace rpc {
 #define __scoped_atomic_fetch_and(src, val, ord, scp)                          \
   __atomic_fetch_and(src, val, ord)
 #endif
+#if !__has_builtin(__scoped_atomic_thread_fence)
+#define __scoped_atomic_thread_fence(ord, scp) __atomic_thread_fence(ord)
+#endif
 
 /// A fixed size channel used to communicate between the RPC client and server.
 struct Buffer {
@@ -49,11 +51,11 @@ static_assert(sizeof(Buffer) == 64, "Buffer size mismatch");
 /// perform and which threads are active in the slots.
 struct Header {
   uint64_t mask;
-  uint16_t opcode;
+  uint32_t opcode;
 };
 
 /// The maximum number of parallel ports that the RPC interface can support.
-constexpr uint64_t MAX_PORT_COUNT = 4096;
+constexpr static uint64_t MAX_PORT_COUNT = 4096;
 
 /// A common process used to synchronize communication between a client and a
 /// server. The process contains a read-only inbox and a write-only outbox used
@@ -110,14 +112,14 @@ template <bool Invert> struct Process {
 
   /// Retrieve the inbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(
+    return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(
+    return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
   }
@@ -128,7 +130,7 @@ template <bool Invert> struct Process {
   /// cheaper than calling load_outbox to get the value to store.
   LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
     uint32_t inverted_outbox = !current_outbox;
-    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
     __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
                             __MEMORY_SCOPE_SYSTEM);
     return inverted_outbox;
@@ -142,7 +144,7 @@ template <bool Invert> struct Process {
       sleep_briefly();
       in = load_inbox(lane_mask, index);
     }
-    __atomic_thread_fence(__ATOMIC_ACQUIRE);
+    __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM);
   }
 
   /// The packet is a linearly allocated array of buffers used to communicate
@@ -162,7 +164,7 @@ template <bool Invert> struct Process {
 
   /// Attempt to claim the lock at index. Return true on lock taken.
   /// lane_mask is a bitmap of the threads in the warp that would hold the
-  /// single lock on success, e.g. the result of gpu::get_lane_mask()
+  /// single lock on success, e.g. the result of rpc::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
   LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
@@ -173,12 +175,12 @@ template <bool Invert> struct Process {
     // There may be threads active which are not in lane mask which must not
     // succeed in taking the lock, as otherwise it will leak. This is handled
     // by making threads which are not in lane_mask or with 0, a no-op.
-    uint32_t id = gpu::get_lane_id();
+    uint32_t id = rpc::get_lane_id();
     bool id_in_lane_mask = lane_mask & (1ul << id);
 
     // All threads in the warp call fetch_or. Possibly at the same time.
     bool before = set_nth(lock, index, id_in_lane_mask);
-    uint64_t packed = gpu::ballot(lane_mask, before);
+    uint64_t packed = rpc::ballot(lane_mask, before);
 
     // If every bit set in lane_mask is also set in packed, every single thread
     // in the warp failed to get the lock. Ballot returns unset for threads not
@@ -198,7 +200,7 @@ template <bool Invert> struct Process {
     // inlining the current function.
     bool holding_lock = lane_mask != packed;
     if (holding_lock)
-      __atomic_thread_fence(__ATOMIC_ACQUIRE);
+      __scoped_atomic_thread_fence(__ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE);
     return holding_lock;
   }
 
@@ -206,14 +208,14 @@ template <bool Invert> struct Process {
   /// convergent, otherwise the compiler will sink the store and deadlock.
   LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
     // Do not move any writes past the unlock.
-    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_DEVICE);
 
     // Use exactly one thread to clear the nth bit in the lock array Must
     // restrict to a single thread to avoid one thread dropping the lock, then
     // an unrelated warp claiming the lock, then a second thread in this warp
     // dropping the lock again.
-    clear_nth(lock, index, gpu::is_first_lane(lane_mask));
-    gpu::sync_lane(lane_mask);
+    clear_nth(lock, index, rpc::is_first_lane(lane_mask));
+    rpc::sync_lane(lane_mask);
   }
 
   /// Number of bytes to allocate for an inbox or outbox.
@@ -276,9 +278,9 @@ template <typename F>
 LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
                                    uint64_t lane_mask, Buffer *slot) {
   if constexpr (is_process_gpu()) {
-    fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
+    fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
   } else {
-    for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
+    for (uint32_t i = 0; i < lane_size; i += rpc::get_num_lanes())
       if (lane_mask & (1ul << i))
         fn(&slot[i], i);
   }
@@ -302,7 +304,7 @@ template <bool T> struct Port {
 
   friend struct Client;
   friend struct Server;
-  friend class cpp::optional<Port<T>>;
+  friend class rpc::optional<Port<T>>;
 
 public:
   template <typename U> LIBC_INLINE void recv(U use);
@@ -315,15 +317,15 @@ template <bool T> struct Port {
   template <typename A>
   LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
 
-  LIBC_INLINE uint16_t get_opcode() const {
+  LIBC_INLINE uint32_t get_opcode() const {
     return process.header[index].opcode;
   }
 
-  LIBC_INLINE uint16_t get_index() const { return index; }
+  LIBC_INLINE uint32_t get_index() const { return index; }
 
   LIBC_INLINE void close() {
     // Wait for all lanes to finish using the port.
-    gpu::sync_lane(lane_mask);
+    rpc::sync_lane(lane_mask);
 
     // The server is passive, if it own the buffer when it closes we need to
     // give ownership back to the client.
@@ -353,14 +355,11 @@ struct Client {
       : process(port_count, buffer) {}
 
   using Port = rpc::Port<false>;
-  template <uint16_t opcode> LIBC_INLINE Port open();
+  template <uint32_t opcode> LIBC_INLINE Port open();
 
 private:
   Process<false> process;
 };
-static_assert(cpp::is_trivially_copyable<Client>::value &&
-                  sizeof(Process<true>) == sizeof(Process<false>),
-              "The client is not trivially copyable from the server");
 
 /// The RPC server used to respond to the client.
 struct Server {
@@ -373,7 +372,7 @@ struct Server {
       : process(port_count, buffer) {}
 
   using Port = rpc::Port<true>;
-  LIBC_INLINE cpp::optional<Port> try_open(uint32_t lane_size,
+  LIBC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
                                            uint32_t start = 0);
   LIBC_INLINE Port open(uint32_t lane_size);
 
@@ -466,7 +465,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.header[index].mask;
-  while (gpu::ballot(mask, idx < num_sends)) {
+  while (rpc::ballot(mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                          ? sizeof(Buffer::data)
@@ -499,7 +498,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.header[index].mask;
-  while (gpu::ballot(mask, idx < num_recvs)) {
+  while (rpc::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                          ? sizeof(Buffer::data)
@@ -517,16 +516,16 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do. The Client interface provides the appropriate lane size to the
 /// port using the platform's returned value.
-template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
+template <uint32_t opcode> LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
-  for (uint32_t index = gpu::get_cluster_id();; ++index) {
+  for (uint32_t index = 0;; ++index) {
     // Start from the beginning if we run out of ports to check.
     if (index >= process.port_count)
       index = 0;
 
     // Attempt to acquire the lock on this index.
-    uint64_t lane_mask = gpu::get_lane_mask();
+    uint64_t lane_mask = rpc::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
       continue;
 
@@ -540,22 +539,22 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
       continue;
     }
 
-    if (gpu::is_first_lane(lane_mask)) {
+    if (rpc::is_first_lane(lane_mask)) {
       process.header[index].opcode = opcode;
       process.header[index].mask = lane_mask;
     }
-    gpu::sync_lane(lane_mask);
-    return Port(process, lane_mask, gpu::get_lane_size(), index, out);
+    rpc::sync_lane(lane_mask);
+    return Port(process, lane_mask, rpc::get_num_lanes(), index, out);
   }
 }
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-LIBC_INLINE cpp::optional<typename Server::Port>
+LIBC_INLINE rpc::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
-    uint64_t lane_mask = gpu::get_lane_mask();
+    uint64_t lane_mask = rpc::get_lane_mask();
     uint32_t in = process.load_inbox(lane_mask, index);
     uint32_t out = process.load_outbox(lane_mask, index);
 
@@ -578,13 +577,13 @@ Server::try_open(uint32_t lane_size, uint32_t start) {
 
     return Port(process, lane_mask, lane_size, index, out);
   }
-  return cpp::nullopt;
+  return rpc::nullopt;
 }
 
 LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
   for (;;) {
-    if (cpp::optional<Server::Port> p = try_open(lane_size))
-      return cpp::move(p.value());
+    if (rpc::optional<Server::Port> p = try_open(lane_size))
+      return rpc::move(p.value());
     sleep_briefly();
   }
 }
@@ -595,6 +594,9 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
 #undef __scoped_atomic_fetch_or
 #undef __scoped_atomic_fetch_and
 #endif
+#if !__has_builtin(__scoped_atomic_thread_fence)
+#undef __scoped_atomic_thread_fence
+#endif
 
 } // namespace rpc
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 695b6b7515bf7..7bd6d0b5e00b4 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -12,11 +12,16 @@
 #include "rpc.h"
 
 #include "include/llvm-libc-types/rpc_opcodes_t.h"
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+static_assert(cpp::is_trivially_copyable<Client>::value &&
+                  sizeof(Process<true>) == sizeof(Process<false>),
+              "The client is not trivially copyable from the server");
+
 /// The libc client instance used to communicate with the server.
 extern Client client;
 
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 93b8289617484..7067dfc974eb3 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -9,23 +9,230 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 
-#include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/threads/sleep.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+#include <gpuintrin.h>
+#define RPC_TARGET_IS_GPU
+#endif
 
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+template <typename T> struct type_identity {
+  using type = T;
+};
+
+template <class T, T v> struct type_constant {
+  static inline constexpr T value = v;
+};
+
+template <class T> struct remove_reference : type_identity<T> {};
+template <class T> struct remove_reference<T &> : type_identity<T> {};
+template <class T> struct remove_reference<T &&> : type_identity<T> {};
+
+template <class T> struct is_const : type_constant<bool, false> {};
+template <class T> struct is_const<const T> : type_constant<bool, true> {};
+
+/// Freestanding implementation of std::move.
+template <class T>
+LIBC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
+  return static_cast<typename remove_reference<T>::type &&>(t);
+}
+
+/// Freestanding implementation of std::forward.
+template <typename T>
+LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
+  return static_cast<T &&>(value);
+}
+template <typename T>
+LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
+  return static_cast<T &&>(value);
+}
+
+struct in_place_t {
+  LIBC_INLINE explicit in_place_t() = default;
+};
+
+struct nullopt_t {
+  LIBC_INLINE constexpr explicit nullopt_t() = default;
+};
+
+constexpr inline in_place_t in_place{};
+constexpr inline nullopt_t nullopt{};
+
+/// Freestanding and minimal implementation of std::optional.
+template <typename T> class optional {
+  template <typename U> struct OptionalStorage {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    LIBC_INLINE ~OptionalStorage() { reset(); }
+
+    LIBC_INLINE constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    LIBC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
+        : stored_value(forward<Args>(args)...) {}
+
+    LIBC_INLINE constexpr void reset() {
+      if (in_use)
+        stored_value.~U();
+      in_use = false;
+    }
+  };
+
+  OptionalStorage<T> storage;
+
+public:
+  LIBC_INLINE constexpr optional() = default;
+  LIBC_INLINE constexpr optional(nullopt_t) {}
+
+  LIBC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
+    storage.in_use = true;
+  }
+  LIBC_INLINE constexpr optional(const optional &) = default;
+
+  LIBC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
+    storage.in_use = true;
+  }
+  LIBC_INLINE constexpr optional(optional &&O) = default;
+
+  LIBC_INLINE constexpr optional &operator=(T &&t) {
+    storage = move(t);
+    return *this;
+  }
+  LIBC_INLINE constexpr optional &operator=(optional &&) = default;
+
+  LIBC_INLINE constexpr optional &operator=(const T &t) {
+    storage = t;
+    return *this;
+  }
+  LIBC_INLINE constexpr optional &operator=(const optional &) = default;
+
+  LIBC_INLINE constexpr void reset() { storage.reset(); }
+
+  LIBC_INLINE constexpr const T &value() const & {
+    return storage.stored_value;
+  }
+
+  LIBC_INLINE constexpr T &value() & { return storage.stored_value; }
+
+  LIBC_INLINE constexpr explicit operator bool() const {
+    return storage.in_use;
+  }
+  LIBC_INLINE constexpr bool has_value() const { return storage.in_use; }
+  LIBC_INLINE constexpr const T *operator->() const {
+    return &storage.stored_value;
+  }
+  LIBC_INLINE constexpr T *operator->() { return &storage.stored_value; }
+  LIBC_INLINE constexpr const T &operator*() const & {
+    return storage.stored_value;
+  }
+  LIBC_INLINE constexpr T &operator*() & { return storage.stored_value; }
+
+  LIBC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
+  LIBC_INLINE constexpr T &&operator*() && {
+    return move(storage.stored_value);
+  }
+};
+
+/// Suspend the thread briefly to assist the thread scheduler during busy loops.
+LIBC_INLINE void sleep_briefly() {
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+    asm("nanosleep.u32 64;" ::: "memory");
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+  __builtin_amdgcn_s_sleep(2);
+#elif defined(LIBC_TARGET_ARCH_IS_X86)
+  __builtin_ia32_pause();
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && __has_builtin(__builtin_arm_isb)
+  __builtin_arm_isb(0xf);
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  asm volatile("isb\n" ::: "memory");
+#else
+  // Simply do nothing if sleeping isn't supported on this platform.
+#endif
+}
+
 /// Conditional to indicate if this process is running on the GPU.
 LIBC_INLINE constexpr bool is_process_gpu() {
-#if defined(__NVPTX__) || defined(__AMDGPU__)
+#ifdef RPC_TARGET_IS_GPU
   return true;
 #else
   return false;
 #endif
 }
 
+/// Wait for all lanes in the group to complete. A no-op on non-GPU targets.
+LIBC_INLINE void sync_lane([[maybe_unused]] uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_sync_lane(lane_mask);
+#endif
+}
+
+/// Copies the value from the first active thread to the rest.
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_read_first_lane_u32(lane_mask, x);
+#else
+  return x;
+#endif
+}
+
+/// Returns the number of lanes that participate in the RPC interface.
+LIBC_INLINE uint32_t get_num_lanes() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_num_lanes();
+#else
+  return 1;
+#endif
+}
+
+/// Returns the bitmask of lanes currently active in the warp or wavefront.
+LIBC_INLINE uint64_t get_lane_mask() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_lane_mask();
+#else
+  return 1;
+#endif
+}
+
+/// Returns the id of the calling thread within its warp or wavefront.
+LIBC_INLINE uint32_t get_lane_id() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_lane_id();
+#else
+  return 0;
+#endif
+}
+
+/// Conditional that is only true for a single thread in a lane.
+LIBC_INLINE bool is_first_lane([[maybe_unused]] uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_is_first_in_lane(lane_mask);
+#else
+  return true;
+#endif
+}
+
+/// Returns a bitmask of threads in the current lane for which \p x is true.
+LIBC_INLINE uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_ballot(lane_mask, x);
+#else
+  return x;
+#endif
+}
+
 /// Return \p val aligned "upwards" according to \p align.
 template <typename V, typename A>
 LIBC_INLINE constexpr V align_up(V val, A align) {
@@ -44,7 +251,7 @@ template <typename V> LIBC_INLINE V &lane_value(V *val, uint32_t id) {
 
 /// Advance the \p p by \p bytes.
 template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
-  if constexpr (cpp::is_const_v<T>)
+  if constexpr (is_const<T>::value)
     return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
                                  bytes);
   else
diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h
index 48c773fa02c17..79803a346f692 100644
--- a/libc/src/__support/common.h
+++ b/libc/src/__support/common.h
@@ -21,9 +21,25 @@
 #define LLVM_LIBC_FUNCTION_ATTR
 #endif
 
+// Allow each function `func` to have extra attributes specified by defining:
+// `LLVM_LIBC_FUNCTION_ATTR_func` macro, which should always start with
+// "LLVM_LIBC_EMPTY, "
+//
+// For example:
+// #define LLVM_LIBC_FUNCTION_ATTR_memcpy LLVM_LIBC_EMPTY, [[gnu::weak]]
+// #define LLVM_LIBC_FUNCTION_ATTR_memchr LLVM_LIBC_EMPTY, [[gnu::weak]]       \
+//                                        [[gnu::visibility("default")]]
+#define LLVM_LIBC_EMPTY
+
+#define GET_SECOND(first, second, ...) second
+#define EXPAND_THEN_SECOND(name) GET_SECOND(name, LLVM_LIBC_EMPTY)
+
+#define LLVM_LIBC_ATTR(name) EXPAND_THEN_SECOND(LLVM_LIBC_FUNCTION_ATTR_##name)
+
 // MacOS needs to be excluded because it does not support aliasing.
 #if defined(LIBC_COPT_PUBLIC_PACKAGING) && (!defined(__APPLE__))
 #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist)                           \
+  LLVM_LIBC_ATTR(name)                                                         \
   LLVM_LIBC_FUNCTION_ATTR decltype(LIBC_NAMESPACE::name)                       \
       __##name##_impl__ __asm__(#name);                                        \
   decltype(LIBC_NAMESPACE::name) name [[gnu::alias(#name)]];                   \
diff --git a/libc/src/__support/hash.h b/libc/src/__support/hash.h
index 527c83993fd59..49138b1f43b9e 100644
--- a/libc/src/__support/hash.h
+++ b/libc/src/__support/hash.h
@@ -13,8 +13,8 @@
 #include "src/__support/CPP/limits.h"        // numeric_limits
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
-#include "src/__support/uint128.h"           // UInt128
-#include <stdint.h>                          // For uint64_t
+#include "src/__support/uint128.h" // UInt128
+#include <stdint.h>                // For uint64_t
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -34,25 +34,23 @@ LIBC_INLINE uint64_t folded_multiply(uint64_t x, uint64_t y) {
 // Therefore, we use a union to read the value.
 template <typename T> LIBC_INLINE T read_little_endian(const void *ptr) {
   const uint8_t *bytes = static_cast<const uint8_t *>(ptr);
-  union {
-    T value;
-    uint8_t buffer[sizeof(T)];
-  } data;
+  uint8_t buffer[sizeof(T)];
 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
-  // Compiler should able to optimize this as a load followed by a byte swap.
-  // On aarch64 (-mbig-endian), this compiles to the following for int:
+  // Compiler should be able to optimize this as a load followed by a byte
+  // swap. On aarch64 (-mbig-endian), this compiles to the following for
+  // int:
   //      ldr     w0, [x0]
   //      rev     w0, w0
   //      ret
   for (size_t i = 0; i < sizeof(T); ++i) {
-    data.buffer[i] = bytes[sizeof(T) - i - 1];
+    buffer[i] = bytes[sizeof(T) - i - 1];
   }
 #else
   for (size_t i = 0; i < sizeof(T); ++i) {
-    data.buffer[i] = bytes[i];
+    buffer[i] = bytes[i];
   }
 #endif
-  return data.value;
+  return cpp::bit_cast<T>(buffer);
 }
 
 // Specialized read functions for small values. size must be <= 8.
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
index 9f2c1959fa5ec..449aedf254ca5 100644
--- a/libc/src/math/generic/exp10m1f16.cpp
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -119,6 +119,9 @@ LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) {
 
     // When |x| <= 2^(-3).
     if (x_abs <= 0x3000U) {
+      if (LIBC_UNLIKELY(x_abs == 0))
+        return x;
+
       if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u);
           LIBC_UNLIKELY(r.has_value()))
         return r.value();
diff --git a/libc/src/math/generic/tanhf16.cpp b/libc/src/math/generic/tanhf16.cpp
index ae9b4be46f7cf..0266b5cfc2df1 100644
--- a/libc/src/math/generic/tanhf16.cpp
+++ b/libc/src/math/generic/tanhf16.cpp
@@ -64,6 +64,9 @@ LLVM_LIBC_FUNCTION(float16, tanhf16, (float16 x)) {
 
     // When |x| <= 0x1.d2p-4.
     if (x_abs <= 0x2f48U) {
+      if (LIBC_UNLIKELY(x_abs == 0))
+        return x;
+
       float xf = x;
       float xf_sq = xf * xf;
       // Degree-7 Taylor expansion generated by Sollya with the following
diff --git a/libc/src/stdio/gpu/file.h b/libc/src/stdio/gpu/file.h
index 16d64e8f37750..6ca792b454580 100644
--- a/libc/src/stdio/gpu/file.h
+++ b/libc/src/stdio/gpu/file.h
@@ -49,7 +49,7 @@ LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
   return stream;
 }
 
-template <uint16_t opcode>
+template <uint32_t opcode>
 LIBC_INLINE uint64_t write_impl(::FILE *file, const void *data, size_t size) {
   uint64_t ret = 0;
   rpc::Client::Port port = rpc::client.open<opcode>();
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
index 5010ee16d9607..a0a8c39781ad6 100644
--- a/libc/src/stdio/gpu/vfprintf_utils.h
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/types/FILE.h"
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,7 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-template <uint16_t opcode>
+template <uint32_t opcode>
 LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
                               const char *__restrict format, size_t format_size,
                               va_list vlist) {
diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp
index cfc7e9b8e228b..3a06fb38c3f64 100644
--- a/libc/src/stdlib/gpu/abort.cpp
+++ b/libc/src/stdlib/gpu/abort.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/test/integration/scudo/CMakeLists.txt b/libc/test/integration/scudo/CMakeLists.txt
index a5f7e3b63d24c..b4011e501b96b 100644
--- a/libc/test/integration/scudo/CMakeLists.txt
+++ b/libc/test/integration/scudo/CMakeLists.txt
@@ -13,14 +13,23 @@ endif()
 add_entrypoint_library(
   libc_for_scudo_integration_test
   DEPENDS
-    libc.src.stdlib.malloc
-    libc.src.stdlib.calloc
-    libc.src.stdlib.realloc
+    libc.src.errno.errno
+    libc.src.fcntl.open
+    libc.src.sched.__sched_getcpucount
     libc.src.stdlib.aligned_alloc
+    libc.src.stdlib.calloc
     libc.src.stdlib.free
-    libc.src.errno.errno
+    libc.src.stdlib.malloc
+    libc.src.stdlib.realloc
+    libc.src.sys.auxv.getauxval
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.munmap
+    libc.src.sys.prctl.prctl
     libc.src.unistd.__llvm_libc_syscall
-    libc.src.sched.__sched_getcpucount
+    libc.src.unistd.close
+    libc.src.unistd.read
+    libc.src.unistd.sysconf
+    libc.src.unistd.write
 )
 
 add_executable(
diff --git a/libc/test/src/__support/HashTable/group_test.cpp b/libc/test/src/__support/HashTable/group_test.cpp
index 25b15312ad668..acdc58e205852 100644
--- a/libc/test/src/__support/HashTable/group_test.cpp
+++ b/libc/test/src/__support/HashTable/group_test.cpp
@@ -8,6 +8,7 @@
 
 #include "src/__support/HashTable/bitmask.h"
 
+#include "src/__support/CPP/bit.h"
 #include "src/__support/macros/config.h"
 #include "src/stdlib/rand.h"
 #include "test/UnitTest/Test.h"
@@ -28,14 +29,13 @@ TEST(LlvmLibcHashTableBitMaskTest, Match) {
   size_t appearance[4][sizeof(Group)];
   ByteArray array{};
 
-  union {
-    uintptr_t random;
-    int data[sizeof(uintptr_t) / sizeof(int)];
-  };
+  int data[sizeof(uintptr_t) / sizeof(int)];
 
   for (int &i : data)
     i = rand();
 
+  uintptr_t random = cpp::bit_cast<uintptr_t>(data);
+
   for (size_t i = 0; i < sizeof(Group); ++i) {
     size_t choice = random % 4;
     random /= 4;
@@ -62,14 +62,13 @@ TEST(LlvmLibcHashTableBitMaskTest, MaskAvailable) {
   for (size_t i = 0; i < sizeof(Group); ++i) {
     ByteArray array{};
 
-    union {
-      uintptr_t random;
-      int data[sizeof(uintptr_t) / sizeof(int)];
-    };
+    int data[sizeof(uintptr_t) / sizeof(int)];
 
     for (int &j : data)
       j = rand();
 
+    uintptr_t random = cpp::bit_cast<uintptr_t>(data);
+
     ASSERT_FALSE(Group::load(array.data).mask_available().any_bit_set());
 
     array.data[i] = 0x80;
diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp
index f8ffa4d4123d3..c3b8697f2087a 100644
--- a/libc/test/src/__support/HashTable/table_test.cpp
+++ b/libc/test/src/__support/HashTable/table_test.cpp
@@ -82,7 +82,7 @@ TEST(LlvmLibcTableTest, GrowthSequence) {
 }
 
 TEST(LlvmLibcTableTest, Insertion) {
-  union key {
+  struct key {
     char bytes[2];
   } keys[256];
   for (size_t k = 0; k < 256; ++k) {
diff --git a/libc/utils/gpu/server/llvmlibc_rpc_server.h b/libc/utils/gpu/server/llvmlibc_rpc_server.h
index b0cf2f916b385..98df882afa21c 100644
--- a/libc/utils/gpu/server/llvmlibc_rpc_server.h
+++ b/libc/utils/gpu/server/llvmlibc_rpc_server.h
@@ -79,7 +79,7 @@ rpc_status_t rpc_handle_server(rpc_device_t rpc_device);
 /// Register a callback to handle an opcode from the RPC client. The associated
 /// data must remain accessible as long as the user intends to handle the server
 /// with this callback.
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint16_t opcode,
+rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
                                    rpc_opcode_callback_ty callback, void *data);
 
 /// Obtain a pointer to a local client buffer that can be copied directly to the
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 11b6d0e27ab94..972601aaf1d5e 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -215,8 +215,8 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
 template <uint32_t lane_size>
 rpc_status_t handle_server_impl(
     rpc::Server &server,
-    const std::unordered_map<uint16_t, rpc_opcode_callback_ty> &callbacks,
-    const std::unordered_map<uint16_t, void *> &callback_data,
+    const std::unordered_map<uint32_t, rpc_opcode_callback_ty> &callbacks,
+    const std::unordered_map<uint32_t, void *> &callback_data,
     uint32_t &index) {
   auto port = server.try_open(lane_size, index);
   if (!port)
@@ -477,8 +477,8 @@ struct Device {
   void *buffer;
   rpc::Server server;
   rpc::Client client;
-  std::unordered_map<uint16_t, rpc_opcode_callback_ty> callbacks;
-  std::unordered_map<uint16_t, void *> callback_data;
+  std::unordered_map<uint32_t, rpc_opcode_callback_ty> callbacks;
+  std::unordered_map<uint32_t, void *> callback_data;
 };
 
 rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports,
@@ -528,7 +528,7 @@ rpc_status_t rpc_handle_server(rpc_device_t rpc_device) {
   }
 }
 
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint16_t opcode,
+rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
                                    rpc_opcode_callback_ty callback,
                                    void *data) {
   if (!rpc_device.handle)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 83168d153a7f4..0ae031e5365ae 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -205,7 +205,6 @@ set(files
   __assert
   __atomic/aliases.h
   __atomic/atomic.h
-  __atomic/atomic_base.h
   __atomic/atomic_flag.h
   __atomic/atomic_init.h
   __atomic/atomic_lock_free.h
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index 113475cb1f007..ae0475693f22b 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -9,9 +9,10 @@
 #ifndef _LIBCPP___ATOMIC_ATOMIC_H
 #define _LIBCPP___ATOMIC_ATOMIC_H
 
-#include <__atomic/atomic_base.h>
+#include <__atomic/atomic_sync.h>
 #include <__atomic/check_memory_order.h>
 #include <__atomic/cxx_atomic_impl.h>
+#include <__atomic/is_always_lock_free.h>
 #include <__atomic/memory_order.h>
 #include <__config>
 #include <__cstddef/ptrdiff_t.h>
@@ -21,6 +22,7 @@
 #include <__type_traits/is_floating_point.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_integral.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/remove_pointer.h>
@@ -34,6 +36,197 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, bool = is_integral<_Tp>::value && !is_same<_Tp, bool>::value>
+struct __atomic_base // false
+{
+  mutable __cxx_atomic_impl<_Tp> __a_;
+
+#if _LIBCPP_STD_VER >= 17
+  static constexpr bool is_always_lock_free = __libcpp_is_always_lock_free<__cxx_atomic_impl<_Tp> >::__value;
+#endif
+
+  _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const volatile _NOEXCEPT {
+    return __cxx_atomic_is_lock_free(sizeof(__cxx_atomic_impl<_Tp>));
+  }
+  _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const _NOEXCEPT {
+    return static_cast<__atomic_base const volatile*>(this)->is_lock_free();
+  }
+  _LIBCPP_HIDE_FROM_ABI void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT
+      _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) {
+    std::__cxx_atomic_store(std::addressof(__a_), __d, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI void store(_Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT
+      _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) {
+    std::__cxx_atomic_store(std::addressof(__a_), __d, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT
+      _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__m) {
+    return std::__cxx_atomic_load(std::addressof(__a_), __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __m = memory_order_seq_cst) const _NOEXCEPT
+      _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__m) {
+    return std::__cxx_atomic_load(std::addressof(__a_), __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI operator _Tp() const volatile _NOEXCEPT { return load(); }
+  _LIBCPP_HIDE_FROM_ABI operator _Tp() const _NOEXCEPT { return load(); }
+  _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_exchange(std::addressof(__a_), __d, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_exchange(std::addressof(__a_), __d, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile _NOEXCEPT
+      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
+    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) _NOEXCEPT
+      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
+    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile _NOEXCEPT
+      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
+    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) _NOEXCEPT
+      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
+    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI bool
+  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
+  }
+
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const
+      volatile _NOEXCEPT {
+    std::__atomic_wait(*this, __v, __m);
+  }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+  wait(_Tp __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT {
+    std::__atomic_wait(*this, __v, __m);
+  }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
+    std::__atomic_notify_one(*this);
+  }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT {
+    std::__atomic_notify_all(*this);
+  }
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+#endif //  _LIBCPP_STD_VER >= 20
+
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_HIDE_FROM_ABI constexpr __atomic_base() noexcept(is_nothrow_default_constructible_v<_Tp>) : __a_(_Tp()) {}
+#else
+  _LIBCPP_HIDE_FROM_ABI __atomic_base() _NOEXCEPT = default;
+#endif
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __a_(__d) {}
+
+  __atomic_base(const __atomic_base&) = delete;
+};
+
+// atomic<Integral>: adds the fetch_* operations and arithmetic/bitwise operators for integral types other than bool
+
+template <class _Tp>
+struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> {
+  using __base = __atomic_base<_Tp, false>;
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __atomic_base() _NOEXCEPT = default;
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __base(__d) {}
+
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_fetch_add(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_fetch_add(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_fetch_sub(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_fetch_sub(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_fetch_and(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_fetch_and(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_fetch_or(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_fetch_or(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
+    return std::__cxx_atomic_fetch_xor(std::addressof(this->__a_), __op, __m);
+  }
+  _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
+    return std::__cxx_atomic_fetch_xor(std::addressof(this->__a_), __op, __m);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) volatile _NOEXCEPT { return fetch_add(_Tp(1)); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) _NOEXCEPT { return fetch_add(_Tp(1)); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) volatile _NOEXCEPT { return fetch_sub(_Tp(1)); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) _NOEXCEPT { return fetch_sub(_Tp(1)); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator++() volatile _NOEXCEPT { return fetch_add(_Tp(1)) + _Tp(1); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator++() _NOEXCEPT { return fetch_add(_Tp(1)) + _Tp(1); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator--() volatile _NOEXCEPT { return fetch_sub(_Tp(1)) - _Tp(1); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator--() _NOEXCEPT { return fetch_sub(_Tp(1)) - _Tp(1); }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __op) volatile _NOEXCEPT { return fetch_add(__op) + __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __op) _NOEXCEPT { return fetch_add(__op) + __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __op) volatile _NOEXCEPT { return fetch_sub(__op) - __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __op) _NOEXCEPT { return fetch_sub(__op) - __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __op) volatile _NOEXCEPT { return fetch_and(__op) & __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __op) _NOEXCEPT { return fetch_and(__op) & __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __op) volatile _NOEXCEPT { return fetch_or(__op) | __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __op) _NOEXCEPT { return fetch_or(__op) | __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __op) volatile _NOEXCEPT { return fetch_xor(__op) ^ __op; }
+  _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __op) _NOEXCEPT { return fetch_xor(__op) ^ __op; }
+};
+
+// Here we need _IsIntegral because the default template argument is not enough,
+// e.g. __atomic_base<int> is __atomic_base<int, true>, which inherits from
+// __atomic_base<int, false>, and the caller of the wait function is
+// __atomic_base<int, false>. So specializing __atomic_base<_Tp> does not work.
+template <class _Tp, bool _IsIntegral>
+struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > {
+  static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_base<_Tp, _IsIntegral>& __a, memory_order __order) {
+    return __a.load(__order);
+  }
+
+  static _LIBCPP_HIDE_FROM_ABI _Tp
+  __atomic_load(const volatile __atomic_base<_Tp, _IsIntegral>& __this, memory_order __order) {
+    return __this.load(__order);
+  }
+
+  static _LIBCPP_HIDE_FROM_ABI const __cxx_atomic_impl<_Tp>*
+  __atomic_contention_address(const __atomic_base<_Tp, _IsIntegral>& __a) {
+    return std::addressof(__a.__a_);
+  }
+
+  static _LIBCPP_HIDE_FROM_ABI const volatile __cxx_atomic_impl<_Tp>*
+  __atomic_contention_address(const volatile __atomic_base<_Tp, _IsIntegral>& __this) {
+    return std::addressof(__this.__a_);
+  }
+};
+
 template <class _Tp>
 struct atomic : public __atomic_base<_Tp> {
   using __base          = __atomic_base<_Tp>;
@@ -123,6 +316,9 @@ struct atomic<_Tp*> : public __atomic_base<_Tp*> {
   atomic& operator=(const atomic&) volatile = delete;
 };
 
+template <class _Tp>
+struct __atomic_waitable_traits<atomic<_Tp> > : __atomic_waitable_traits<__atomic_base<_Tp> > {};
+
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
   requires is_floating_point_v<_Tp>
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
deleted file mode 100644
index 93f5c4cff0d1b..0000000000000
--- a/libcxx/include/__atomic/atomic_base.h
+++ /dev/null
@@ -1,223 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___ATOMIC_ATOMIC_BASE_H
-#define _LIBCPP___ATOMIC_ATOMIC_BASE_H
-
-#include <__atomic/atomic_sync.h>
-#include <__atomic/check_memory_order.h>
-#include <__atomic/cxx_atomic_impl.h>
-#include <__atomic/is_always_lock_free.h>
-#include <__atomic/memory_order.h>
-#include <__config>
-#include <__memory/addressof.h>
-#include <__type_traits/is_integral.h>
-#include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_same.h>
-#include <version>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp, bool = is_integral<_Tp>::value && !is_same<_Tp, bool>::value>
-struct __atomic_base // false
-{
-  mutable __cxx_atomic_impl<_Tp> __a_;
-
-#if _LIBCPP_STD_VER >= 17
-  static constexpr bool is_always_lock_free = __libcpp_is_always_lock_free<__cxx_atomic_impl<_Tp> >::__value;
-#endif
-
-  _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const volatile _NOEXCEPT {
-    return __cxx_atomic_is_lock_free(sizeof(__cxx_atomic_impl<_Tp>));
-  }
-  _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const _NOEXCEPT {
-    return static_cast<__atomic_base const volatile*>(this)->is_lock_free();
-  }
-  _LIBCPP_HIDE_FROM_ABI void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT
-      _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) {
-    std::__cxx_atomic_store(std::addressof(__a_), __d, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI void store(_Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT
-      _LIBCPP_CHECK_STORE_MEMORY_ORDER(__m) {
-    std::__cxx_atomic_store(std::addressof(__a_), __d, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT
-      _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__m) {
-    return std::__cxx_atomic_load(std::addressof(__a_), __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __m = memory_order_seq_cst) const _NOEXCEPT
-      _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__m) {
-    return std::__cxx_atomic_load(std::addressof(__a_), __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI operator _Tp() const volatile _NOEXCEPT { return load(); }
-  _LIBCPP_HIDE_FROM_ABI operator _Tp() const _NOEXCEPT { return load(); }
-  _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_exchange(std::addressof(__a_), __d, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_exchange(std::addressof(__a_), __d, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile _NOEXCEPT
-      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
-    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) _NOEXCEPT
-      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
-    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile _NOEXCEPT
-      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
-    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) _NOEXCEPT
-      _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) {
-    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __s, __f);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_compare_exchange_weak(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI bool
-  compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
-  }
-
-#if _LIBCPP_STD_VER >= 20
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const
-      volatile _NOEXCEPT {
-    std::__atomic_wait(*this, __v, __m);
-  }
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-  wait(_Tp __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT {
-    std::__atomic_wait(*this, __v, __m);
-  }
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
-    std::__atomic_notify_one(*this);
-  }
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); }
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT {
-    std::__atomic_notify_all(*this);
-  }
-  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
-#endif //  _LIBCPP_STD_VER >= 20
-
-#if _LIBCPP_STD_VER >= 20
-  _LIBCPP_HIDE_FROM_ABI constexpr __atomic_base() noexcept(is_nothrow_default_constructible_v<_Tp>) : __a_(_Tp()) {}
-#else
-  _LIBCPP_HIDE_FROM_ABI __atomic_base() _NOEXCEPT = default;
-#endif
-
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __a_(__d) {}
-
-  __atomic_base(const __atomic_base&) = delete;
-};
-
-// atomic<Integral>
-
-template <class _Tp>
-struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> {
-  using __base = __atomic_base<_Tp, false>;
-
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __atomic_base() _NOEXCEPT = default;
-
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __base(__d) {}
-
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_fetch_add(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_fetch_add(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_fetch_sub(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_fetch_sub(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_fetch_and(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_fetch_and(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_fetch_or(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_fetch_or(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT {
-    return std::__cxx_atomic_fetch_xor(std::addressof(this->__a_), __op, __m);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) _NOEXCEPT {
-    return std::__cxx_atomic_fetch_xor(std::addressof(this->__a_), __op, __m);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) volatile _NOEXCEPT { return fetch_add(_Tp(1)); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) _NOEXCEPT { return fetch_add(_Tp(1)); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) volatile _NOEXCEPT { return fetch_sub(_Tp(1)); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) _NOEXCEPT { return fetch_sub(_Tp(1)); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator++() volatile _NOEXCEPT { return fetch_add(_Tp(1)) + _Tp(1); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator++() _NOEXCEPT { return fetch_add(_Tp(1)) + _Tp(1); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator--() volatile _NOEXCEPT { return fetch_sub(_Tp(1)) - _Tp(1); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator--() _NOEXCEPT { return fetch_sub(_Tp(1)) - _Tp(1); }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __op) volatile _NOEXCEPT { return fetch_add(__op) + __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __op) _NOEXCEPT { return fetch_add(__op) + __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __op) volatile _NOEXCEPT { return fetch_sub(__op) - __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __op) _NOEXCEPT { return fetch_sub(__op) - __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __op) volatile _NOEXCEPT { return fetch_and(__op) & __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __op) _NOEXCEPT { return fetch_and(__op) & __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __op) volatile _NOEXCEPT { return fetch_or(__op) | __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __op) _NOEXCEPT { return fetch_or(__op) | __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __op) volatile _NOEXCEPT { return fetch_xor(__op) ^ __op; }
-  _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __op) _NOEXCEPT { return fetch_xor(__op) ^ __op; }
-};
-
-// Here we need _IsIntegral because the default template argument is not enough
-// e.g  __atomic_base<int> is __atomic_base<int, true>, which inherits from
-// __atomic_base<int, false> and the caller of the wait function is
-// __atomic_base<int, false>. So specializing __atomic_base<_Tp> does not work
-template <class _Tp, bool _IsIntegral>
-struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > {
-  static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_base<_Tp, _IsIntegral>& __a, memory_order __order) {
-    return __a.load(__order);
-  }
-
-  static _LIBCPP_HIDE_FROM_ABI _Tp
-  __atomic_load(const volatile __atomic_base<_Tp, _IsIntegral>& __this, memory_order __order) {
-    return __this.load(__order);
-  }
-
-  static _LIBCPP_HIDE_FROM_ABI const __cxx_atomic_impl<_Tp>*
-  __atomic_contention_address(const __atomic_base<_Tp, _IsIntegral>& __a) {
-    return std::addressof(__a.__a_);
-  }
-
-  static _LIBCPP_HIDE_FROM_ABI const volatile __cxx_atomic_impl<_Tp>*
-  __atomic_contention_address(const volatile __atomic_base<_Tp, _IsIntegral>& __this) {
-    return std::addressof(__this.__a_);
-  }
-};
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___ATOMIC_ATOMIC_BASE_H
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index 716d198bc236b..d4adf277c49c7 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -591,7 +591,6 @@ template <class T>
 
 #include <__atomic/aliases.h>
 #include <__atomic/atomic.h>
-#include <__atomic/atomic_base.h>
 #include <__atomic/atomic_flag.h>
 #include <__atomic/atomic_init.h>
 #include <__atomic/atomic_lock_free.h>
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index c7df0e9e6e8d4..980eae06ab140 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -50,7 +50,7 @@ namespace std
 #if _LIBCPP_HAS_THREADS
 
 #  include <__assert>
-#  include <__atomic/atomic_base.h>
+#  include <__atomic/atomic.h>
 #  include <__atomic/memory_order.h>
 #  include <__cstddef/ptrdiff_t.h>
 #  include <__memory/unique_ptr.h>
@@ -109,9 +109,9 @@ template <class _CompletionF>
 class __barrier_base {
   ptrdiff_t __expected_;
   unique_ptr<__barrier_algorithm_base, void (*)(__barrier_algorithm_base*)> __base_;
-  __atomic_base<ptrdiff_t> __expected_adjustment_;
+  atomic<ptrdiff_t> __expected_adjustment_;
   _CompletionF __completion_;
-  __atomic_base<__barrier_phase_t> __phase_;
+  atomic<__barrier_phase_t> __phase_;
 
 public:
   using arrival_token = __barrier_phase_t;
@@ -167,10 +167,10 @@ Two versions of this algorithm are provided:
 
 template <class _CompletionF>
 class __barrier_base {
-  __atomic_base<ptrdiff_t> __expected;
-  __atomic_base<ptrdiff_t> __arrived;
+  atomic<ptrdiff_t> __expected;
+  atomic<ptrdiff_t> __arrived;
   _CompletionF __completion;
-  __atomic_base<bool> __phase;
+  atomic<bool> __phase;
 
 public:
   using arrival_token = bool;
@@ -212,7 +212,7 @@ class __barrier_base<__empty_completion> {
   static constexpr uint64_t __phase_bit     = 1ull << 63;
   static constexpr uint64_t __arrived_mask  = (__phase_bit - 1) & ~__expected_mask;
 
-  __atomic_base<uint64_t> __phase_arrived_expected;
+  atomic<uint64_t> __phase_arrived_expected;
 
   static _LIBCPP_HIDE_FROM_ABI constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT {
     return ((uint64_t(1u << 31) - __count) << 32) | (uint64_t(1u << 31) - __count);
diff --git a/libcxx/include/bit b/libcxx/include/bit
index 94387d101a398..092aebca26a31 100644
--- a/libcxx/include/bit
+++ b/libcxx/include/bit
@@ -87,10 +87,6 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
-#  include <cstdint>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cstdlib>
 #  include <iosfwd>
diff --git a/libcxx/include/charconv b/libcxx/include/charconv
index 8f5e697eec439..a65b3d3527080 100644
--- a/libcxx/include/charconv
+++ b/libcxx/include/charconv
@@ -101,13 +101,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 _LIBCPP_END_NAMESPACE_STD
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14
-#  include <cerrno>
-#  include <cstddef>
-#  include <initializer_list>
-#  include <new>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cmath>
 #  include <concepts>
diff --git a/libcxx/include/compare b/libcxx/include/compare
index de0e4c7ec2280..440d4c4b4dd26 100644
--- a/libcxx/include/compare
+++ b/libcxx/include/compare
@@ -164,12 +164,6 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
-#  include <cstddef>
-#  include <cstdint>
-#  include <limits>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cmath>
 #  include <cstddef>
diff --git a/libcxx/include/expected b/libcxx/include/expected
index 6a2f12f2bf3b5..3c7ef336432a1 100644
--- a/libcxx/include/expected
+++ b/libcxx/include/expected
@@ -53,10 +53,4 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
-#  include <cstddef>
-#  include <initializer_list>
-#  include <new>
-#endif
-
 #endif // _LIBCPP_EXPECTED
diff --git a/libcxx/include/future b/libcxx/include/future
index 9f7c95e542fd6..cbf3ed9346417 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -384,6 +384,7 @@ template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>;
 #  include <__system_error/error_category.h>
 #  include <__system_error/error_code.h>
 #  include <__system_error/error_condition.h>
+#  include <__thread/thread.h>
 #  include <__type_traits/add_lvalue_reference.h>
 #  include <__type_traits/aligned_storage.h>
 #  include <__type_traits/conditional.h>
@@ -397,7 +398,6 @@ template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>;
 #  include <mutex>
 #  include <new>
 #  include <stdexcept>
-#  include <thread>
 #  include <version>
 
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -2071,6 +2071,7 @@ _LIBCPP_POP_MACROS
 #  include <exception>
 #  include <iosfwd>
 #  include <system_error>
+#  include <thread>
 #endif
 
 #endif // _LIBCPP_FUTURE
diff --git a/libcxx/include/latch b/libcxx/include/latch
index 90cca27c50c37..1860ed816c856 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -45,7 +45,7 @@ namespace std
 #if _LIBCPP_HAS_THREADS
 
 #  include <__assert>
-#  include <__atomic/atomic_base.h>
+#  include <__atomic/atomic.h>
 #  include <__atomic/atomic_sync.h>
 #  include <__atomic/memory_order.h>
 #  include <__cstddef/ptrdiff_t.h>
@@ -64,7 +64,7 @@ _LIBCPP_PUSH_MACROS
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 class latch {
-  __atomic_base<ptrdiff_t> __a_;
+  atomic<ptrdiff_t> __a_;
 
 public:
   static _LIBCPP_HIDE_FROM_ABI constexpr ptrdiff_t max() noexcept { return numeric_limits<ptrdiff_t>::max(); }
diff --git a/libcxx/include/mdspan b/libcxx/include/mdspan
index 29190e4a9953e..d6191a197e15c 100644
--- a/libcxx/include/mdspan
+++ b/libcxx/include/mdspan
@@ -426,13 +426,4 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
-#  include <array>
-#  include <cinttypes>
-#  include <concepts>
-#  include <cstddef>
-#  include <limits>
-#  include <span>
-#endif
-
 #endif // _LIBCPP_MDSPAN
diff --git a/libcxx/include/memory_resource b/libcxx/include/memory_resource
index e98ca20aa058c..7de69e67b7c06 100644
--- a/libcxx/include/memory_resource
+++ b/libcxx/include/memory_resource
@@ -66,15 +66,6 @@ namespace std::pmr {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14
-#  include <cstddef>
-#  include <cstdint>
-#  include <limits>
-#  include <mutex>
-#  include <new>
-#  include <tuple>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <stdexcept>
 #endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 139c0a8366643..4e06a68c6a6b6 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -844,7 +844,6 @@ module std [system] {
 
   module atomic {
     module aliases                { header "__atomic/aliases.h" }
-    module atomic_base            { header "__atomic/atomic_base.h" }
     module atomic_flag            { header "__atomic/atomic_flag.h" }
     module atomic_init            { header "__atomic/atomic_init.h" }
     module atomic_lock_free       { header "__atomic/atomic_lock_free.h" }
diff --git a/libcxx/include/ranges b/libcxx/include/ranges
index b17a399e0ed65..d8ee6f75e8b23 100644
--- a/libcxx/include/ranges
+++ b/libcxx/include/ranges
@@ -446,14 +446,6 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
-#  include <cstddef>
-#  include <limits>
-#  include <optional>
-#  include <span>
-#  include <tuple>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <cstdlib>
 #  include <iosfwd>
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index 05c85bc810603..c594df459c93f 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -50,7 +50,7 @@ using binary_semaphore = counting_semaphore<1>; // since C++20
 #if _LIBCPP_HAS_THREADS
 
 #  include <__assert>
-#  include <__atomic/atomic_base.h>
+#  include <__atomic/atomic.h>
 #  include <__atomic/atomic_sync.h>
 #  include <__atomic/memory_order.h>
 #  include <__chrono/time_point.h>
@@ -83,7 +83,7 @@ functions. It avoids contention against users' own use of those facilities.
 #    define _LIBCPP_SEMAPHORE_MAX (numeric_limits<ptrdiff_t>::max())
 
 class __atomic_semaphore_base {
-  __atomic_base<ptrdiff_t> __a_;
+  atomic<ptrdiff_t> __a_;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __atomic_semaphore_base(ptrdiff_t __count) : __a_(__count) {}
diff --git a/libcxx/include/string b/libcxx/include/string
index a994f65a9a6e4..bf7fc3c37ecd7 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -3374,7 +3374,7 @@ template <class _CharT, class _Traits, class _Allocator>
 inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void
 basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target_capacity) {
   __annotate_delete();
-  auto __guard = std::__make_scope_guard(__annotate_new_size(*this));
+  auto __guard    = std::__make_scope_guard(__annotate_new_size(*this));
   size_type __cap = capacity();
   size_type __sz  = size();
 
diff --git a/libcxx/include/thread b/libcxx/include/thread
index bfe7e4a4c51e5..d7c3f704ad672 100644
--- a/libcxx/include/thread
+++ b/libcxx/include/thread
@@ -90,11 +90,17 @@ void sleep_for(const chrono::duration<Rep, Period>& rel_time);
 
 #if _LIBCPP_HAS_THREADS
 
-#  include <__thread/formatter.h>
-#  include <__thread/jthread.h>
-#  include <__thread/support.h>
 #  include <__thread/this_thread.h>
 #  include <__thread/thread.h>
+
+#  if _LIBCPP_STD_VER >= 20
+#    include <__thread/jthread.h>
+#  endif
+
+#  if _LIBCPP_STD_VER >= 23
+#    include <__thread/formatter.h>
+#  endif
+
 #  include <version>
 
 // standard-mandated includes
@@ -108,13 +114,6 @@ void sleep_for(const chrono::duration<Rep, Period>& rel_time);
 
 #endif // _LIBCPP_HAS_THREADS
 
-#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES)
-#  include <cstddef>
-#  include <ctime>
-#  include <iosfwd>
-#  include <ratio>
-#endif
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17
 #  include <chrono>
 #endif
diff --git a/libcxx/src/barrier.cpp b/libcxx/src/barrier.cpp
index 69601bfeec054..b97c7bd73b74c 100644
--- a/libcxx/src/barrier.cpp
+++ b/libcxx/src/barrier.cpp
@@ -17,7 +17,7 @@ class __barrier_algorithm_base {
 public:
   struct alignas(64) /* naturally-align the heap state */ __state_t {
     struct {
-      __atomic_base<__barrier_phase_t> __phase{0};
+      atomic<__barrier_phase_t> __phase{0};
     } __tickets[64];
   };
 
diff --git a/libcxx/test/benchmarks/CMakeLists.txt b/libcxx/test/benchmarks/CMakeLists.txt
index b5a4aae82c06a..b0fe600623d96 100644
--- a/libcxx/test/benchmarks/CMakeLists.txt
+++ b/libcxx/test/benchmarks/CMakeLists.txt
@@ -35,13 +35,14 @@ ExternalProject_Add(google-benchmark
         SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
         INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark
         CMAKE_CACHE_ARGS
-          -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-          -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+          -DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER}
+          -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER}
+          -DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}
           -DCMAKE_BUILD_TYPE:STRING=RELEASE
           -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
           -DCMAKE_CXX_FLAGS:STRING=${BENCHMARK_COMPILE_FLAGS}
           -DBENCHMARK_USE_LIBCXX:BOOL=ON
           -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-          -DBENCHMARK_CXX_LIBRARIES:STRING="${BENCHMARK_CXX_LIBRARIES}")
+          -DBENCHMARK_CXX_LIBRARIES:STRING=${BENCHMARK_CXX_LIBRARIES})
 
 add_dependencies(cxx-test-depends google-benchmark)
diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h
index 742c848328604..38e11777f488b 100644
--- a/libcxx/test/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/ContainerBenchmarks.h
@@ -11,6 +11,8 @@
 #define BENCHMARK_CONTAINER_BENCHMARKS_H
 
 #include <cassert>
+#include <iterator>
+#include <utility>
 
 #include "Utilities.h"
 #include "benchmark/benchmark.h"
@@ -149,6 +151,34 @@ void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) {
   }
 }
 
+template <class Container, class GenInputs>
+void BM_erase_iter_in_middle(benchmark::State& st, Container, GenInputs gen) {
+  auto in = gen(st.range(0));
+  Container c(in.begin(), in.end());
+  assert(c.size() > 2);
+  for (auto _ : st) {
+    auto mid    = std::next(c.begin(), c.size() / 2);
+    auto tmp    = *mid;
+    auto result = c.erase(mid); // erase an element in the middle
+    benchmark::DoNotOptimize(result);
+    c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container
+  }
+}
+
+template <class Container, class GenInputs>
+void BM_erase_iter_at_start(benchmark::State& st, Container, GenInputs gen) {
+  auto in = gen(st.range(0));
+  Container c(in.begin(), in.end());
+  assert(c.size() > 2);
+  for (auto _ : st) {
+    auto it     = c.begin();
+    auto tmp    = *it;
+    auto result = c.erase(it); // erase the first element
+    benchmark::DoNotOptimize(result);
+    c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container
+  }
+}
+
 template <class Container, class GenInputs>
 void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
   auto in = gen(st.range(0));
diff --git a/libcxx/test/benchmarks/atomic_wait.bench.cpp b/libcxx/test/benchmarks/atomic_wait.bench.cpp
index 49503a318fda1..d19f5fbed8ad6 100644
--- a/libcxx/test/benchmarks/atomic_wait.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait.bench.cpp
@@ -9,6 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 #include <atomic>
+#include <cstdint>
 #include <numeric>
 #include <stop_token>
 #include <thread>
diff --git a/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
index 221fc086d2a62..a554c721df017 100644
--- a/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 #include <atomic>
+#include <cstdint>
 #include <mutex>
-#include <numeric>
 #include <stop_token>
 #include <thread>
 
diff --git a/libcxx/test/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/deque.bench.cpp
index b8f3b76dd27ee..ab0ba96b12ffc 100644
--- a/libcxx/test/benchmarks/deque.bench.cpp
+++ b/libcxx/test/benchmarks/deque.bench.cpp
@@ -9,6 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 #include <deque>
+#include <string>
 
 #include "benchmark/benchmark.h"
 
@@ -41,4 +42,14 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_size_t, std::deque<size_t>{}, get
 BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_string, std::deque<std::string>{}, getRandomStringInputs)
     ->Arg(TestNumInputs);
 
+BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_string, std::deque<std::string>{}, getRandomStringInputs)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+
+BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_string, std::deque<std::string>{}, getRandomStringInputs)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+
 BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/stop_token.bench.cpp b/libcxx/test/benchmarks/stop_token.bench.cpp
index 6149f91c6fc38..a627f80697dd5 100644
--- a/libcxx/test/benchmarks/stop_token.bench.cpp
+++ b/libcxx/test/benchmarks/stop_token.bench.cpp
@@ -8,7 +8,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
-#include <numeric>
+#include <cstdint>
 #include <optional>
 #include <stop_token>
 #include <thread>
diff --git a/libcxx/test/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/vector_operations.bench.cpp
index ce8ab233fc981..1855861263324 100644
--- a/libcxx/test/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/test/benchmarks/vector_operations.bench.cpp
@@ -54,6 +54,16 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>
 
 BENCHMARK_CAPTURE(BM_Pushback_no_grow, vector_int, std::vector<int>{})->Arg(TestNumInputs);
 
+BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_string, std::vector<std::string>{}, getRandomStringInputs)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+
+BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_string, std::vector<std::string>{}, getRandomStringInputs)
+    ->Range(TestNumInputs, TestNumInputs * 10);
+
 template <class T>
 void bm_grow(benchmark::State& state) {
   for (auto _ : state) {
diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
index 44d51921ac74a..a8093ae22b38d 100644
--- a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp
@@ -15,6 +15,7 @@
 #include <atomic>
 #include <cassert>
 #include <chrono>
+#include <cstdint>
 #include <thread>
 
 #include "make_test_thread.h"
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index d70541290023b..72fccfd364932 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -242,17 +242,14 @@ ccomplex utility
 ccomplex variant
 ccomplex vector
 ccomplex version
-charconv cerrno
 charconv cmath
 charconv concepts
 charconv cstddef
 charconv cstdint
 charconv cstdlib
 charconv cstring
-charconv initializer_list
 charconv iosfwd
 charconv limits
-charconv new
 charconv type_traits
 charconv version
 chrono algorithm
@@ -571,12 +568,6 @@ exception typeinfo
 exception version
 execution cstddef
 execution version
-expected cstddef
-expected cstdint
-expected cstdlib
-expected initializer_list
-expected new
-expected type_traits
 expected version
 experimental/iterator algorithm
 experimental/iterator atomic
@@ -1340,53 +1331,6 @@ map utility
 map variant
 map vector
 map version
-mdspan algorithm
-mdspan array
-mdspan atomic
-mdspan bit
-mdspan cctype
-mdspan cerrno
-mdspan cinttypes
-mdspan climits
-mdspan clocale
-mdspan cmath
-mdspan compare
-mdspan concepts
-mdspan cstdarg
-mdspan cstddef
-mdspan cstdint
-mdspan cstdio
-mdspan cstdlib
-mdspan cstring
-mdspan ctime
-mdspan cwchar
-mdspan cwctype
-mdspan exception
-mdspan functional
-mdspan initializer_list
-mdspan ios
-mdspan iosfwd
-mdspan iterator
-mdspan limits
-mdspan locale
-mdspan memory
-mdspan mutex
-mdspan new
-mdspan optional
-mdspan ratio
-mdspan span
-mdspan stdexcept
-mdspan streambuf
-mdspan string
-mdspan string_view
-mdspan system_error
-mdspan tuple
-mdspan type_traits
-mdspan typeinfo
-mdspan unordered_map
-mdspan utility
-mdspan variant
-mdspan vector
 mdspan version
 memory atomic
 memory cctype
@@ -1416,42 +1360,15 @@ memory typeinfo
 memory utility
 memory variant
 memory version
-memory_resource algorithm
-memory_resource atomic
-memory_resource bit
-memory_resource cctype
-memory_resource cerrno
-memory_resource climits
-memory_resource cmath
-memory_resource compare
-memory_resource concepts
 memory_resource cstddef
 memory_resource cstdint
-memory_resource cstdio
 memory_resource cstdlib
-memory_resource cstring
-memory_resource ctime
-memory_resource cwchar
-memory_resource cwctype
 memory_resource exception
-memory_resource initializer_list
 memory_resource iosfwd
-memory_resource iterator
-memory_resource limits
-memory_resource memory
-memory_resource mutex
 memory_resource new
-memory_resource optional
-memory_resource ratio
 memory_resource stdexcept
-memory_resource string
-memory_resource string_view
-memory_resource system_error
-memory_resource tuple
 memory_resource type_traits
 memory_resource typeinfo
-memory_resource utility
-memory_resource variant
 memory_resource version
 mutex algorithm
 mutex atomic
@@ -1772,52 +1689,28 @@ random utility
 random variant
 random vector
 random version
-ranges algorithm
-ranges array
-ranges atomic
-ranges bit
 ranges cctype
-ranges cerrno
-ranges climits
-ranges clocale
 ranges cmath
 ranges compare
 ranges concepts
-ranges cstdarg
 ranges cstddef
 ranges cstdint
 ranges cstdio
 ranges cstdlib
 ranges cstring
-ranges ctime
 ranges cwchar
 ranges cwctype
 ranges exception
-ranges functional
 ranges initializer_list
-ranges ios
 ranges iosfwd
 ranges iterator
 ranges limits
-ranges locale
-ranges memory
-ranges mutex
 ranges new
-ranges optional
-ranges ratio
-ranges span
-ranges stdexcept
-ranges streambuf
-ranges string
-ranges string_view
-ranges system_error
 ranges tuple
 ranges type_traits
 ranges typeinfo
-ranges unordered_map
 ranges utility
 ranges variant
-ranges vector
 ranges version
 ratio climits
 ratio cstdint
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index d70541290023b..72fccfd364932 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -242,17 +242,14 @@ ccomplex utility
 ccomplex variant
 ccomplex vector
 ccomplex version
-charconv cerrno
 charconv cmath
 charconv concepts
 charconv cstddef
 charconv cstdint
 charconv cstdlib
 charconv cstring
-charconv initializer_list
 charconv iosfwd
 charconv limits
-charconv new
 charconv type_traits
 charconv version
 chrono algorithm
@@ -571,12 +568,6 @@ exception typeinfo
 exception version
 execution cstddef
 execution version
-expected cstddef
-expected cstdint
-expected cstdlib
-expected initializer_list
-expected new
-expected type_traits
 expected version
 experimental/iterator algorithm
 experimental/iterator atomic
@@ -1340,53 +1331,6 @@ map utility
 map variant
 map vector
 map version
-mdspan algorithm
-mdspan array
-mdspan atomic
-mdspan bit
-mdspan cctype
-mdspan cerrno
-mdspan cinttypes
-mdspan climits
-mdspan clocale
-mdspan cmath
-mdspan compare
-mdspan concepts
-mdspan cstdarg
-mdspan cstddef
-mdspan cstdint
-mdspan cstdio
-mdspan cstdlib
-mdspan cstring
-mdspan ctime
-mdspan cwchar
-mdspan cwctype
-mdspan exception
-mdspan functional
-mdspan initializer_list
-mdspan ios
-mdspan iosfwd
-mdspan iterator
-mdspan limits
-mdspan locale
-mdspan memory
-mdspan mutex
-mdspan new
-mdspan optional
-mdspan ratio
-mdspan span
-mdspan stdexcept
-mdspan streambuf
-mdspan string
-mdspan string_view
-mdspan system_error
-mdspan tuple
-mdspan type_traits
-mdspan typeinfo
-mdspan unordered_map
-mdspan utility
-mdspan variant
-mdspan vector
 mdspan version
 memory atomic
 memory cctype
@@ -1416,42 +1360,15 @@ memory typeinfo
 memory utility
 memory variant
 memory version
-memory_resource algorithm
-memory_resource atomic
-memory_resource bit
-memory_resource cctype
-memory_resource cerrno
-memory_resource climits
-memory_resource cmath
-memory_resource compare
-memory_resource concepts
 memory_resource cstddef
 memory_resource cstdint
-memory_resource cstdio
 memory_resource cstdlib
-memory_resource cstring
-memory_resource ctime
-memory_resource cwchar
-memory_resource cwctype
 memory_resource exception
-memory_resource initializer_list
 memory_resource iosfwd
-memory_resource iterator
-memory_resource limits
-memory_resource memory
-memory_resource mutex
 memory_resource new
-memory_resource optional
-memory_resource ratio
 memory_resource stdexcept
-memory_resource string
-memory_resource string_view
-memory_resource system_error
-memory_resource tuple
 memory_resource type_traits
 memory_resource typeinfo
-memory_resource utility
-memory_resource variant
 memory_resource version
 mutex algorithm
 mutex atomic
@@ -1772,52 +1689,28 @@ random utility
 random variant
 random vector
 random version
-ranges algorithm
-ranges array
-ranges atomic
-ranges bit
 ranges cctype
-ranges cerrno
-ranges climits
-ranges clocale
 ranges cmath
 ranges compare
 ranges concepts
-ranges cstdarg
 ranges cstddef
 ranges cstdint
 ranges cstdio
 ranges cstdlib
 ranges cstring
-ranges ctime
 ranges cwchar
 ranges cwctype
 ranges exception
-ranges functional
 ranges initializer_list
-ranges ios
 ranges iosfwd
 ranges iterator
 ranges limits
-ranges locale
-ranges memory
-ranges mutex
 ranges new
-ranges optional
-ranges ratio
-ranges span
-ranges stdexcept
-ranges streambuf
-ranges string
-ranges string_view
-ranges system_error
 ranges tuple
 ranges type_traits
 ranges typeinfo
-ranges unordered_map
 ranges utility
 ranges variant
-ranges vector
 ranges version
 ratio climits
 ratio cstdint
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 90bff887eb278..fd36dace19c76 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -247,17 +247,14 @@ ccomplex utility
 ccomplex variant
 ccomplex vector
 ccomplex version
-charconv cerrno
 charconv cmath
 charconv concepts
 charconv cstddef
 charconv cstdint
 charconv cstdlib
 charconv cstring
-charconv initializer_list
 charconv iosfwd
 charconv limits
-charconv new
 charconv type_traits
 charconv version
 chrono algorithm
@@ -582,12 +579,6 @@ exception typeinfo
 exception version
 execution cstddef
 execution version
-expected cstddef
-expected cstdint
-expected cstdlib
-expected initializer_list
-expected new
-expected type_traits
 expected version
 experimental/iterator algorithm
 experimental/iterator atomic
@@ -1370,54 +1361,6 @@ map utility
 map variant
 map vector
 map version
-mdspan algorithm
-mdspan array
-mdspan atomic
-mdspan bit
-mdspan cctype
-mdspan cerrno
-mdspan cinttypes
-mdspan climits
-mdspan clocale
-mdspan cmath
-mdspan compare
-mdspan concepts
-mdspan cstdarg
-mdspan cstddef
-mdspan cstdint
-mdspan cstdio
-mdspan cstdlib
-mdspan cstring
-mdspan ctime
-mdspan cwchar
-mdspan cwctype
-mdspan exception
-mdspan execution
-mdspan functional
-mdspan initializer_list
-mdspan ios
-mdspan iosfwd
-mdspan iterator
-mdspan limits
-mdspan locale
-mdspan memory
-mdspan mutex
-mdspan new
-mdspan optional
-mdspan ratio
-mdspan span
-mdspan stdexcept
-mdspan streambuf
-mdspan string
-mdspan string_view
-mdspan system_error
-mdspan tuple
-mdspan type_traits
-mdspan typeinfo
-mdspan unordered_map
-mdspan utility
-mdspan variant
-mdspan vector
 mdspan version
 memory atomic
 memory cctype
@@ -1447,43 +1390,15 @@ memory typeinfo
 memory utility
 memory variant
 memory version
-memory_resource algorithm
-memory_resource atomic
-memory_resource bit
-memory_resource cctype
-memory_resource cerrno
-memory_resource climits
-memory_resource cmath
-memory_resource compare
-memory_resource concepts
 memory_resource cstddef
 memory_resource cstdint
-memory_resource cstdio
 memory_resource cstdlib
-memory_resource cstring
-memory_resource ctime
-memory_resource cwchar
-memory_resource cwctype
 memory_resource exception
-memory_resource execution
-memory_resource initializer_list
 memory_resource iosfwd
-memory_resource iterator
-memory_resource limits
-memory_resource memory
-memory_resource mutex
 memory_resource new
-memory_resource optional
-memory_resource ratio
 memory_resource stdexcept
-memory_resource string
-memory_resource string_view
-memory_resource system_error
-memory_resource tuple
 memory_resource type_traits
 memory_resource typeinfo
-memory_resource utility
-memory_resource variant
 memory_resource version
 mutex algorithm
 mutex atomic
@@ -1808,53 +1723,28 @@ random utility
 random variant
 random vector
 random version
-ranges algorithm
-ranges array
-ranges atomic
-ranges bit
 ranges cctype
-ranges cerrno
-ranges climits
-ranges clocale
 ranges cmath
 ranges compare
 ranges concepts
-ranges cstdarg
 ranges cstddef
 ranges cstdint
 ranges cstdio
 ranges cstdlib
 ranges cstring
-ranges ctime
 ranges cwchar
 ranges cwctype
 ranges exception
-ranges execution
-ranges functional
 ranges initializer_list
-ranges ios
 ranges iosfwd
 ranges iterator
 ranges limits
-ranges locale
-ranges memory
-ranges mutex
 ranges new
-ranges optional
-ranges ratio
-ranges span
-ranges stdexcept
-ranges streambuf
-ranges string
-ranges string_view
-ranges system_error
 ranges tuple
 ranges type_traits
 ranges typeinfo
-ranges unordered_map
 ranges utility
 ranges variant
-ranges vector
 ranges version
 ratio climits
 ratio cstdint
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 2f908e7f78ec1..eaec25f81e582 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -571,12 +571,6 @@ exception typeinfo
 exception version
 execution cstddef
 execution version
-expected cstddef
-expected cstdint
-expected cstdlib
-expected initializer_list
-expected new
-expected type_traits
 expected version
 experimental/iterator algorithm
 experimental/iterator atomic
@@ -1364,53 +1358,6 @@ map utility
 map variant
 map vector
 map version
-mdspan algorithm
-mdspan array
-mdspan atomic
-mdspan bit
-mdspan cctype
-mdspan cerrno
-mdspan cinttypes
-mdspan climits
-mdspan clocale
-mdspan cmath
-mdspan compare
-mdspan concepts
-mdspan cstdarg
-mdspan cstddef
-mdspan cstdint
-mdspan cstdio
-mdspan cstdlib
-mdspan cstring
-mdspan ctime
-mdspan cwchar
-mdspan cwctype
-mdspan exception
-mdspan functional
-mdspan initializer_list
-mdspan ios
-mdspan iosfwd
-mdspan iterator
-mdspan limits
-mdspan locale
-mdspan memory
-mdspan mutex
-mdspan new
-mdspan optional
-mdspan ratio
-mdspan span
-mdspan stdexcept
-mdspan streambuf
-mdspan string
-mdspan string_view
-mdspan system_error
-mdspan tuple
-mdspan type_traits
-mdspan typeinfo
-mdspan unordered_map
-mdspan utility
-mdspan variant
-mdspan vector
 mdspan version
 memory atomic
 memory cctype
@@ -1796,52 +1743,28 @@ random utility
 random variant
 random vector
 random version
-ranges algorithm
-ranges array
-ranges atomic
-ranges bit
 ranges cctype
-ranges cerrno
-ranges climits
-ranges clocale
 ranges cmath
 ranges compare
 ranges concepts
-ranges cstdarg
 ranges cstddef
 ranges cstdint
 ranges cstdio
 ranges cstdlib
 ranges cstring
-ranges ctime
 ranges cwchar
 ranges cwctype
 ranges exception
-ranges functional
 ranges initializer_list
-ranges ios
 ranges iosfwd
 ranges iterator
 ranges limits
-ranges locale
-ranges memory
-ranges mutex
 ranges new
-ranges optional
-ranges ratio
-ranges span
-ranges stdexcept
-ranges streambuf
-ranges string
-ranges string_view
-ranges system_error
 ranges tuple
 ranges type_traits
 ranges typeinfo
-ranges unordered_map
 ranges utility
 ranges variant
-ranges vector
 ranges version
 ratio climits
 ratio cstdint
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 1a198aa4562fd..89c28e49d6c9d 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -567,12 +567,6 @@ exception typeinfo
 exception version
 execution cstddef
 execution version
-expected cstddef
-expected cstdint
-expected cstdlib
-expected initializer_list
-expected new
-expected type_traits
 expected version
 experimental/iterator algorithm
 experimental/iterator atomic
@@ -1358,53 +1352,6 @@ map utility
 map variant
 map vector
 map version
-mdspan algorithm
-mdspan array
-mdspan atomic
-mdspan bit
-mdspan cctype
-mdspan cerrno
-mdspan cinttypes
-mdspan climits
-mdspan clocale
-mdspan cmath
-mdspan compare
-mdspan concepts
-mdspan cstdarg
-mdspan cstddef
-mdspan cstdint
-mdspan cstdio
-mdspan cstdlib
-mdspan cstring
-mdspan ctime
-mdspan cwchar
-mdspan cwctype
-mdspan exception
-mdspan functional
-mdspan initializer_list
-mdspan ios
-mdspan iosfwd
-mdspan iterator
-mdspan limits
-mdspan locale
-mdspan memory
-mdspan mutex
-mdspan new
-mdspan optional
-mdspan ratio
-mdspan span
-mdspan stdexcept
-mdspan streambuf
-mdspan string
-mdspan string_view
-mdspan system_error
-mdspan tuple
-mdspan type_traits
-mdspan typeinfo
-mdspan unordered_map
-mdspan utility
-mdspan variant
-mdspan vector
 mdspan version
 memory atomic
 memory cctype
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 791aad29710b5..a008b4d76edde 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -445,8 +445,6 @@ functional tuple
 functional typeinfo
 functional unordered_map
 functional version
-future array
-future atomic
 future bitset
 future cctype
 future cerrno
@@ -475,7 +473,6 @@ future stdexcept
 future streambuf
 future string
 future string_view
-future thread
 future tuple
 future typeinfo
 future version
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 78c457a22c31d..d5321da32b3d4 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -444,8 +444,6 @@ functional tuple
 functional typeinfo
 functional unordered_map
 functional version
-future array
-future atomic
 future bitset
 future cctype
 future cerrno
@@ -474,7 +472,6 @@ future stdexcept
 future streambuf
 future string
 future string_view
-future thread
 future tuple
 future typeinfo
 future version
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h b/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h
new file mode 100644
index 0000000000000..72cd47a50b2c0
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/common.h
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H
+#define TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H
+
+#include "test_macros.h"
+
+#include <type_traits> // for __libcpp_is_trivially_relocatable
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+struct Throws {
+  Throws() : v_(0) {}
+  Throws(int v) : v_(v) {}
+  Throws(const Throws& rhs) : v_(rhs.v_) {
+    if (sThrows)
+      throw 1;
+  }
+  Throws(Throws&& rhs) : v_(rhs.v_) {
+    if (sThrows)
+      throw 1;
+  }
+  Throws& operator=(const Throws& rhs) {
+    v_ = rhs.v_;
+    return *this;
+  }
+  Throws& operator=(Throws&& rhs) {
+    v_ = rhs.v_;
+    return *this;
+  }
+  int v_;
+  static bool sThrows;
+};
+
+bool Throws::sThrows = false;
+#endif
+
+struct Tracker {
+  int copy_assignments = 0;
+  int move_assignments = 0;
+};
+
+struct TrackedAssignment {
+  Tracker* tracker_;
+  TEST_CONSTEXPR_CXX14 explicit TrackedAssignment(Tracker* tracker) : tracker_(tracker) {}
+
+  TrackedAssignment(TrackedAssignment const&) = default;
+  TrackedAssignment(TrackedAssignment&&)      = default;
+
+  TEST_CONSTEXPR_CXX14 TrackedAssignment& operator=(TrackedAssignment const&) {
+    tracker_->copy_assignments++;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 TrackedAssignment& operator=(TrackedAssignment&&) {
+    tracker_->move_assignments++;
+    return *this;
+  }
+};
+
+struct NonTriviallyRelocatable {
+  int value_;
+  TEST_CONSTEXPR NonTriviallyRelocatable() : value_(0) {}
+  TEST_CONSTEXPR explicit NonTriviallyRelocatable(int v) : value_(v) {}
+  TEST_CONSTEXPR NonTriviallyRelocatable(NonTriviallyRelocatable const& other) : value_(other.value_) {}
+  TEST_CONSTEXPR NonTriviallyRelocatable(NonTriviallyRelocatable&& other) : value_(other.value_) {}
+  TEST_CONSTEXPR_CXX14 NonTriviallyRelocatable& operator=(NonTriviallyRelocatable const& other) {
+    value_ = other.value_;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 NonTriviallyRelocatable& operator=(NonTriviallyRelocatable&& other) {
+    value_ = other.value_;
+    return *this;
+  }
+
+  TEST_CONSTEXPR_CXX14 friend bool operator==(NonTriviallyRelocatable const& a, NonTriviallyRelocatable const& b) {
+    return a.value_ == b.value_;
+  }
+};
+LIBCPP_STATIC_ASSERT(!std::__libcpp_is_trivially_relocatable<NonTriviallyRelocatable>::value, "");
+
+#endif // TEST_STD_CONTAINERS_SEQUENCES_VECTOR_VECTOR_MODIFIERS_COMMON_H
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp
index 549f29a8f7ba1..f0157eb74b90f 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.pass.cpp
@@ -11,135 +11,79 @@
 // iterator erase(const_iterator position);
 
 #include <vector>
-#include <iterator>
 #include <cassert>
+#include <memory>
 
 #include "asan_testing.h"
+#include "common.h"
 #include "min_allocator.h"
 #include "MoveOnly.h"
 #include "test_macros.h"
 
-#ifndef TEST_HAS_NO_EXCEPTIONS
-struct Throws {
-  Throws() : v_(0) {}
-  Throws(int v) : v_(v) {}
-  Throws(const Throws& rhs) : v_(rhs.v_) {
-    if (sThrows)
-      throw 1;
-  }
-  Throws(Throws&& rhs) : v_(rhs.v_) {
-    if (sThrows)
-      throw 1;
-  }
-  Throws& operator=(const Throws& rhs) {
-    v_ = rhs.v_;
-    return *this;
-  }
-  Throws& operator=(Throws&& rhs) {
-    v_ = rhs.v_;
-    return *this;
-  }
-  int v_;
-  static bool sThrows;
-};
-
-bool Throws::sThrows = false;
-#endif
-
-TEST_CONSTEXPR_CXX20 bool tests() {
-  {
-    int a1[] = {1, 2, 3, 4, 5};
-    std::vector<int> l1(a1, a1 + 5);
-    l1.erase(l1.begin());
-    assert(is_contiguous_container_asan_correct(l1));
-    assert(l1 == std::vector<int>(a1 + 1, a1 + 5));
-  }
+template <template <class> class Allocator, class T>
+TEST_CONSTEXPR_CXX20 void tests() {
   {
-    int a1[] = {1, 2, 3, 4, 5};
-    int e1[] = {1, 3, 4, 5};
-    std::vector<int> l1(a1, a1 + 5);
-    l1.erase(l1.begin() + 1);
-    assert(is_contiguous_container_asan_correct(l1));
-    assert(l1 == std::vector<int>(e1, e1 + 4));
-  }
-  {
-    int a1[] = {1, 2, 3};
-    std::vector<int> l1(a1, a1 + 3);
-    std::vector<int>::const_iterator i = l1.begin();
-    assert(is_contiguous_container_asan_correct(l1));
-    ++i;
-    std::vector<int>::iterator j = l1.erase(i);
-    assert(l1.size() == 2);
-    assert(std::distance(l1.begin(), l1.end()) == 2);
-    assert(*j == 3);
-    assert(*l1.begin() == 1);
-    assert(*std::next(l1.begin()) == 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    j = l1.erase(j);
-    assert(j == l1.end());
-    assert(l1.size() == 1);
-    assert(std::distance(l1.begin(), l1.end()) == 1);
-    assert(*l1.begin() == 1);
-    assert(is_contiguous_container_asan_correct(l1));
-    j = l1.erase(l1.begin());
-    assert(j == l1.end());
-    assert(l1.size() == 0);
-    assert(std::distance(l1.begin(), l1.end()) == 0);
-    assert(is_contiguous_container_asan_correct(l1));
-  }
+    T arr[]             = {T(1), T(2), T(3), T(4), T(5)};
+    using Vector        = std::vector<T, Allocator<T> >;
+    using Iterator      = typename Vector::iterator;
 
-  // Make sure vector::erase works with move-only types
-  // When non-trivial
-  {
-    std::vector<MoveOnly> v;
-    v.emplace_back(1);
-    v.emplace_back(2);
-    v.emplace_back(3);
-    v.erase(v.begin());
-    assert(v.size() == 2);
-    assert(v[0] == MoveOnly(2));
-    assert(v[1] == MoveOnly(3));
-  }
-  // When trivial
-  {
-    std::vector<TrivialMoveOnly> v;
-    v.emplace_back(1);
-    v.emplace_back(2);
-    v.emplace_back(3);
-    v.erase(v.begin());
-    assert(v.size() == 2);
-    assert(v[0] == TrivialMoveOnly(2));
-    assert(v[1] == TrivialMoveOnly(3));
+    {
+      Vector v(arr, arr + 5);
+      Iterator it = v.erase(v.cbegin());
+      assert(v == Vector(arr + 1, arr + 5));
+      assert(it == v.begin());
+      assert(is_contiguous_container_asan_correct(v));
+    }
+    {
+      T expected[] = {T(1), T(3), T(4), T(5)};
+      Vector v(arr, arr + 5);
+      Iterator it = v.erase(v.cbegin() + 1);
+      assert(v == Vector(expected, expected + 4));
+      assert(it == v.begin() + 1);
+      assert(is_contiguous_container_asan_correct(v));
+    }
+    {
+      T expected[] = {T(1), T(2), T(3), T(4)};
+      Vector v(arr, arr + 5);
+      Iterator it = v.erase(v.cbegin() + 4);
+      assert(v == Vector(expected, expected + 4));
+      assert(it == v.end());
+      assert(is_contiguous_container_asan_correct(v));
+    }
   }
 
-#if TEST_STD_VER >= 11
+  // Make sure vector::erase works with move-only types
   {
-    int a1[] = {1, 2, 3};
-    std::vector<int, min_allocator<int>> l1(a1, a1 + 3);
-    std::vector<int, min_allocator<int>>::const_iterator i = l1.begin();
-    assert(is_contiguous_container_asan_correct(l1));
-    ++i;
-    std::vector<int, min_allocator<int>>::iterator j = l1.erase(i);
-    assert(l1.size() == 2);
-    assert(std::distance(l1.begin(), l1.end()) == 2);
-    assert(*j == 3);
-    assert(*l1.begin() == 1);
-    assert(*std::next(l1.begin()) == 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    j = l1.erase(j);
-    assert(j == l1.end());
-    assert(l1.size() == 1);
-    assert(std::distance(l1.begin(), l1.end()) == 1);
-    assert(*l1.begin() == 1);
-    assert(is_contiguous_container_asan_correct(l1));
-    j = l1.erase(l1.begin());
-    assert(j == l1.end());
-    assert(l1.size() == 0);
-    assert(std::distance(l1.begin(), l1.end()) == 0);
-    assert(is_contiguous_container_asan_correct(l1));
+    // When non-trivial
+    {
+      std::vector<MoveOnly, Allocator<MoveOnly> > v;
+      v.emplace_back(1);
+      v.emplace_back(2);
+      v.emplace_back(3);
+      v.erase(v.begin());
+      assert(v.size() == 2);
+      assert(v[0] == MoveOnly(2));
+      assert(v[1] == MoveOnly(3));
+    }
+    // When trivial
+    {
+      std::vector<TrivialMoveOnly, Allocator<TrivialMoveOnly> > v;
+      v.emplace_back(1);
+      v.emplace_back(2);
+      v.emplace_back(3);
+      v.erase(v.begin());
+      assert(v.size() == 2);
+      assert(v[0] == TrivialMoveOnly(2));
+      assert(v[1] == TrivialMoveOnly(3));
+    }
   }
-#endif
+}
 
+TEST_CONSTEXPR_CXX20 bool tests() {
+  tests<std::allocator, int>();
+  tests<std::allocator, NonTriviallyRelocatable>();
+  tests<min_allocator, int>();
+  tests<min_allocator, NonTriviallyRelocatable>();
   return true;
 }
 
@@ -163,5 +107,31 @@ int main(int, char**) {
   }
 #endif
 
+  // Make sure we satisfy the complexity requirement in terms of the number of times the assignment
+  // operator is called.
+  //
+  // There is currently ambiguity as to whether this is truly mandated by the Standard, so we only
+  // test it for libc++.
+#ifdef _LIBCPP_VERSION
+  {
+    Tracker tracker;
+    std::vector<TrackedAssignment> v;
+
+    // Set up the vector with 5 elements.
+    for (int i = 0; i != 5; ++i) {
+      v.emplace_back(&tracker);
+    }
+    assert(tracker.copy_assignments == 0);
+    assert(tracker.move_assignments == 0);
+
+    // Erase element [1] from it. Elements [2] [3] [4] should be shifted, so we should
+    // see 3 move assignments (and nothing else).
+    v.erase(v.begin() + 1);
+    assert(v.size() == 4);
+    assert(tracker.copy_assignments == 0);
+    assert(tracker.move_assignments == 3);
+  }
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.pass.cpp
index 4091e71d814e3..104dfb4cb07d4 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.pass.cpp
@@ -11,86 +11,107 @@
 // iterator erase(const_iterator first, const_iterator last);
 
 #include <vector>
-#include <iterator>
 #include <cassert>
+#include <memory>
+#include <string>
 
 #include "asan_testing.h"
+#include "common.h"
 #include "min_allocator.h"
 #include "MoveOnly.h"
 #include "test_macros.h"
 
-#ifndef TEST_HAS_NO_EXCEPTIONS
-struct Throws {
-  Throws() : v_(0) {}
-  Throws(int v) : v_(v) {}
-  Throws(const Throws& rhs) : v_(rhs.v_) {
-    if (sThrows)
-      throw 1;
-  }
-  Throws(Throws&& rhs) : v_(rhs.v_) {
-    if (sThrows)
-      throw 1;
-  }
-  Throws& operator=(const Throws& rhs) {
-    v_ = rhs.v_;
-    return *this;
-  }
-  Throws& operator=(Throws&& rhs) {
-    v_ = rhs.v_;
-    return *this;
-  }
-  int v_;
-  static bool sThrows;
-};
+template <template <class> class Allocator, class T>
+TEST_CONSTEXPR_CXX20 void tests() {
+  {
+    T arr[]             = {T(1), T(2), T(3)};
+    using Vector        = std::vector<T, Allocator<T> >;
+    using Iterator      = typename Vector::iterator;
+    using ConstIterator = typename Vector::const_iterator;
 
-bool Throws::sThrows = false;
-#endif
+    // Erase an empty range [first, last): last should be returned
+    {
+      {
+        Vector v;
+        Iterator i = v.erase(v.end(), v.end());
+        assert(v.empty());
+        assert(i == v.end());
+        assert(is_contiguous_container_asan_correct(v));
+      }
+      {
+        Vector v(arr, arr + 3);
+        ConstIterator first = v.cbegin(), last = v.cbegin();
+        Iterator i = v.erase(first, last);
+        assert(v == Vector(arr, arr + 3));
+        assert(i == last);
+        assert(is_contiguous_container_asan_correct(v));
+      }
+      {
+        Vector v(arr, arr + 3);
+        ConstIterator first = v.cbegin() + 1, last = v.cbegin() + 1;
+        Iterator i = v.erase(first, last);
+        assert(v == Vector(arr, arr + 3));
+        assert(i == last);
+        assert(is_contiguous_container_asan_correct(v));
+      }
+      {
+        Vector v(arr, arr + 3);
+        ConstIterator first = v.cend(), last = v.cend(); // empty range at the end of a non-empty vector
+        Iterator i = v.erase(first, last);
+        assert(v == Vector(arr, arr + 3));
+        assert(i == v.end());
+        assert(is_contiguous_container_asan_correct(v));
+      }
+    }
 
-TEST_CONSTEXPR_CXX20 bool tests() {
-  int a1[] = {1, 2, 3};
-  {
-    std::vector<int> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int>::iterator i = l1.erase(l1.cbegin(), l1.cbegin());
-    assert(l1.size() == 3);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 3);
-    assert(i == l1.begin());
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin()));
-    assert(l1.size() == 2);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 2);
-    assert(i == l1.begin());
-    assert(l1 == std::vector<int>(a1 + 1, a1 + 3));
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin(), 2));
-    assert(l1.size() == 1);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 1);
-    assert(i == l1.begin());
-    assert(l1 == std::vector<int>(a1 + 2, a1 + 3));
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin(), 3));
-    assert(l1.size() == 0);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 0);
-    assert(i == l1.begin());
-    assert(is_contiguous_container_asan_correct(l1));
+    // Erase non-empty ranges
+    {
+      // Starting at begin()
+      {
+        {
+          Vector v(arr, arr + 3);
+          Iterator i = v.erase(v.cbegin(), v.cbegin() + 1);
+          assert(v == Vector(arr + 1, arr + 3));
+          assert(i == v.begin());
+          assert(is_contiguous_container_asan_correct(v));
+        }
+        {
+          Vector v(arr, arr + 3);
+          Iterator i = v.erase(v.cbegin(), v.cbegin() + 2);
+          assert(v == Vector(arr + 2, arr + 3));
+          assert(i == v.begin());
+          assert(is_contiguous_container_asan_correct(v));
+        }
+        {
+          Vector v(arr, arr + 3);
+          Iterator i = v.erase(v.cbegin(), v.end());
+          assert(v.size() == 0);
+          assert(i == v.begin());
+          assert(is_contiguous_container_asan_correct(v));
+        }
+      }
+      {
+        Vector v(arr, arr + 3);
+        Iterator i = v.erase(v.cbegin() + 1, v.cbegin() + 2);
+        assert(v.size() == 2);
+        assert(v[0] == arr[0]);
+        assert(v[1] == arr[2]);
+        assert(i == v.begin() + 1);
+        assert(is_contiguous_container_asan_correct(v));
+      }
+      {
+        Vector v(arr, arr + 3);
+        Iterator i = v.erase(v.cbegin() + 1, v.cend());
+        assert(v == Vector(arr, arr + 1));
+        assert(i == v.begin() + 1);
+        assert(is_contiguous_container_asan_correct(v));
+      }
+    }
   }
   {
-    std::vector<std::vector<int> > outer(2, std::vector<int>(1));
-    assert(is_contiguous_container_asan_correct(outer));
-    assert(is_contiguous_container_asan_correct(outer[0]));
-    assert(is_contiguous_container_asan_correct(outer[1]));
+    using InnerVector = std::vector<T, Allocator<T> >;
+    using Vector      = std::vector<InnerVector, Allocator<InnerVector> >;
+    Vector outer(2, InnerVector(1));
     outer.erase(outer.begin(), outer.begin());
     assert(outer.size() == 2);
     assert(outer[0].size() == 1);
@@ -99,11 +120,12 @@ TEST_CONSTEXPR_CXX20 bool tests() {
     assert(is_contiguous_container_asan_correct(outer[0]));
     assert(is_contiguous_container_asan_correct(outer[1]));
   }
+
   // Make sure vector::erase works with move-only types
   {
     // When non-trivial
     {
-      std::vector<MoveOnly> v;
+      std::vector<MoveOnly, Allocator<MoveOnly> > v;
       v.emplace_back(1);
       v.emplace_back(2);
       v.emplace_back(3);
@@ -113,7 +135,7 @@ TEST_CONSTEXPR_CXX20 bool tests() {
     }
     // When trivial
     {
-      std::vector<TrivialMoveOnly> v;
+      std::vector<TrivialMoveOnly, Allocator<TrivialMoveOnly> > v;
       v.emplace_back(1);
       v.emplace_back(2);
       v.emplace_back(3);
@@ -122,67 +144,19 @@ TEST_CONSTEXPR_CXX20 bool tests() {
       assert(v[0] == TrivialMoveOnly(3));
     }
   }
-#if TEST_STD_VER >= 11
-  {
-    std::vector<int, min_allocator<int>> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int, min_allocator<int>>::iterator i = l1.erase(l1.cbegin(), l1.cbegin());
-    assert(l1.size() == 3);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 3);
-    assert(i == l1.begin());
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int, min_allocator<int>> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int, min_allocator<int>>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin()));
-    assert(l1.size() == 2);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 2);
-    assert(i == l1.begin());
-    assert((l1 == std::vector<int, min_allocator<int>>(a1 + 1, a1 + 3)));
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int, min_allocator<int>> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int, min_allocator<int>>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin(), 2));
-    assert(l1.size() == 1);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 1);
-    assert(i == l1.begin());
-    assert((l1 == std::vector<int, min_allocator<int>>(a1 + 2, a1 + 3)));
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<int, min_allocator<int>> l1(a1, a1 + 3);
-    assert(is_contiguous_container_asan_correct(l1));
-    std::vector<int, min_allocator<int>>::iterator i = l1.erase(l1.cbegin(), std::next(l1.cbegin(), 3));
-    assert(l1.size() == 0);
-    assert(std::distance(l1.cbegin(), l1.cend()) == 0);
-    assert(i == l1.begin());
-    assert(is_contiguous_container_asan_correct(l1));
-  }
-  {
-    std::vector<std::vector<int, min_allocator<int>>, min_allocator<std::vector<int, min_allocator<int>>>> outer(
-        2, std::vector<int, min_allocator<int>>(1));
-    assert(is_contiguous_container_asan_correct(outer));
-    assert(is_contiguous_container_asan_correct(outer[0]));
-    assert(is_contiguous_container_asan_correct(outer[1]));
-    outer.erase(outer.begin(), outer.begin());
-    assert(outer.size() == 2);
-    assert(outer[0].size() == 1);
-    assert(outer[1].size() == 1);
-    assert(is_contiguous_container_asan_correct(outer));
-    assert(is_contiguous_container_asan_correct(outer[0]));
-    assert(is_contiguous_container_asan_correct(outer[1]));
-  }
-#endif
+}
 
+TEST_CONSTEXPR_CXX20 bool tests() {
+  tests<std::allocator, int>();
+  tests<std::allocator, NonTriviallyRelocatable>();
+  tests<min_allocator, int>();
+  tests<min_allocator, NonTriviallyRelocatable>();
   return true;
 }
 
 int main(int, char**) {
   tests();
-#if TEST_STD_VER > 17
+#if TEST_STD_VER >= 20
   static_assert(tests());
 #endif
 
@@ -200,5 +174,53 @@ int main(int, char**) {
   }
 #endif
 
+  // Real world example with std::string, mostly intended to test trivial relocation
+  {
+    std::vector<std::string> v;
+
+    // Fill the vector with alternating short and long strings.
+    std::string short_string = "short";
+    std::string long_string(256, 'x');
+    for (int i = 0; i != 10; ++i) {
+      v.push_back(i % 2 == 0 ? short_string : long_string);
+    }
+
+    std::vector<std::string> original = v;
+
+    auto it = v.erase(v.cbegin() + 2, v.cbegin() + 8);
+    assert(v.size() == 4);
+    assert(v[0] == original[0]);
+    assert(v[1] == original[1]);
+    assert(v[2] == original[8]);
+    assert(v[3] == original[9]);
+    assert(it == v.begin() + 2);
+  }
+
+  // Make sure we satisfy the complexity requirement in terms of the number of times the assignment
+  // operator is called.
+  //
+  // There is currently ambiguity as to whether this is truly mandated by the Standard, so we only
+  // test it for libc++.
+#ifdef _LIBCPP_VERSION
+  {
+    Tracker tracker;
+    std::vector<TrackedAssignment> v;
+
+    // Set up the vector with 5 elements.
+    for (int i = 0; i != 5; ++i) {
+      v.emplace_back(&tracker);
+    }
+    assert(tracker.copy_assignments == 0);
+    assert(tracker.move_assignments == 0);
+
+    // Erase elements [1] and [2] from it. Elements [3] [4] should be shifted, so we should
+    // see 2 move assignments (and nothing else).
+    v.erase(v.begin() + 1, v.begin() + 3);
+    assert(v.size() == 3);
+    assert(tracker.copy_assignments == 0);
+    assert(tracker.move_assignments == 2);
+  }
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
index 85b641322d99e..52c4e3e0f69b6 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
@@ -11,6 +11,10 @@
 // These compiler versions don't enable sized deallocation by default.
 // UNSUPPORTED: clang-17, clang-18
 
+// Android clang-r536225 identifies as clang-19.0 but it predates the real
+// LLVM 19.0.0, so it also leaves sized deallocation off by default.
+// UNSUPPORTED: android && clang-19.0
+
 // UNSUPPORTED: sanitizer-new-delete, c++03, c++11
 // XFAIL: apple-clang
 // XFAIL: using-built-library-before-llvm-11
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
index ae614a1432f7d..e00339761ec24 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
@@ -11,6 +11,10 @@
 // These compiler versions do not enable sized deallocation by default.
 // UNSUPPORTED: clang-17, clang-18
 
+// Android clang-r536225 identifies as clang-19.0 but it predates the real
+// LLVM 19.0.0, so it also leaves sized deallocation off by default.
+// UNSUPPORTED: android && clang-19.0
+
 // UNSUPPORTED: sanitizer-new-delete, c++03, c++11
 // XFAIL: apple-clang
 // XFAIL: using-built-library-before-llvm-11
diff --git a/libcxx/test/std/thread/futures/futures.async/async.pass.cpp b/libcxx/test/std/thread/futures/futures.async/async.pass.cpp
index 7e0d82f0d6589..109372b50a311 100644
--- a/libcxx/test/std/thread/futures/futures.async/async.pass.cpp
+++ b/libcxx/test/std/thread/futures/futures.async/async.pass.cpp
@@ -27,6 +27,7 @@
 #include <chrono>
 #include <future>
 #include <memory>
+#include <thread>
 
 #include "test_macros.h"
 
diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
index bf6b0f05e64f0..5d54075812560 100644
--- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
@@ -11,10 +11,8 @@
 
 // <semaphore>
 
+#include <cstddef>
 #include <semaphore>
-#include <thread>
-
-#include "test_macros.h"
 
 int main(int, char**)
 {
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/is_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/is_array.pass.cpp
index 26a469a30515a..f48c2ccb59151 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/is_array.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.cat/is_array.pass.cpp
@@ -73,7 +73,15 @@ typedef void (*FunctionPtr)();
 int main(int, char**)
 {
     test_is_array<char[3]>();
+    // Android clang-r536225 identifies as clang-19.0, but it predates the
+    // LLVM 19.0.0 release. It lacks llvm.org/pr86652, which changed __is_array
+    // to return false for T[0]. llvm.org/pr93037 relies on that change for
+    // correct handling of std::is_array<T[0]>. This test will pass as long as
+    // Clang and libc++ come from the same LLVM commit, but we can't detect that
+    // here.
+#if !defined(__ANDROID__) || TEST_CLANG_VER != 1900
     test_is_not_array<char[0]>();
+#endif
     test_is_array<char[]>();
     test_is_array<Union[]>();
 
diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.default.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.default.pass.cpp
index dc05a2b488594..9c1adf5937cc0 100644
--- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.default.pass.cpp
+++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.default.pass.cpp
@@ -15,7 +15,6 @@
 // NOTE: The SFINAE on the default constructor is tested in
 //       default-sfinae.pass.cpp
 
-
 #include <utility>
 #include <type_traits>
 #include <cassert>
@@ -23,33 +22,35 @@
 #include "test_macros.h"
 #include "archetypes.h"
 
-int main(int, char**)
-{
-    {
-        typedef std::pair<float, short*> P;
-        P p;
-        assert(p.first == 0.0f);
-        assert(p.second == nullptr);
-    }
+int main(int, char**) {
+  {
+    typedef std::pair<float, short*> P;
+    P p;
+    assert(p.first == 0.0f);
+    assert(p.second == nullptr);
+  }
 #if TEST_STD_VER >= 11
-    {
-        typedef std::pair<float, short*> P;
-        constexpr P p;
-        static_assert(p.first == 0.0f, "");
-        static_assert(p.second == nullptr, "");
-    }
-    {
-        using NoDefault = ImplicitTypes::NoDefault;
-        using P = std::pair<int, NoDefault>;
-        static_assert(!std::is_default_constructible<P>::value, "");
-        using P2 = std::pair<NoDefault, int>;
-        static_assert(!std::is_default_constructible<P2>::value, "");
-    }
-    {
-        struct Base { };
-        struct Derived : Base { protected: Derived() = default; };
-        static_assert(!std::is_default_constructible<std::pair<Derived, int> >::value, "");
-    }
+  {
+    typedef std::pair<float, short*> P;
+    constexpr P p;
+    static_assert(p.first == 0.0f, "");
+    static_assert(p.second == nullptr, "");
+  }
+  {
+    using NoDefault = ImplicitTypes::NoDefault;
+    using P         = std::pair<int, NoDefault>;
+    static_assert(!std::is_default_constructible<P>::value, "");
+    using P2 = std::pair<NoDefault, int>;
+    static_assert(!std::is_default_constructible<P2>::value, "");
+  }
+  {
+    struct Base {};
+    struct Derived : Base {
+    protected:
+      Derived() = default;
+    };
+    static_assert(!std::is_default_constructible<std::pair<Derived, int> >::value, "");
+  }
 #endif
 
   return 0;
diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml
index 487423a1c109c..1e5d35a7fecdb 100644
--- a/libcxx/utils/ci/docker-compose.yml
+++ b/libcxx/utils/ci/docker-compose.yml
@@ -21,7 +21,7 @@ services:
       target: android-buildkite-builder
       args:
         BASE_IMAGE: ubuntu:noble
-        ANDROID_CLANG_VERSION: r498229b
-        ANDROID_CLANG_PREBUILTS_COMMIT: 5186d132c99aa75dc25207c392e3ea5b93d0107e
-        ANDROID_SYSROOT_BID: 10957860
+        ANDROID_CLANG_VERSION: r536225
+        ANDROID_CLANG_PREBUILTS_COMMIT: 3f67b93ee7a50ae2a3cb34cc32d0589415cc0a9c
+        ANDROID_SYSROOT_BID: 12644632
         <<: *compiler_versions
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index 2d20b094888c7..0f6a40a41ca00 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -131,7 +131,14 @@ class ImportDirectoryChunk : public NonSectionChunk {
 // Contents of this chunk is always null bytes.
 class NullChunk : public NonSectionChunk {
 public:
-  explicit NullChunk(size_t n) : size(n) { hasData = false; }
+  explicit NullChunk(size_t n, uint32_t align) : size(n) {
+    hasData = false;
+    setAlignment(align);
+  }
+  explicit NullChunk(COFFLinkerContext &ctx)
+      : NullChunk(ctx.config.wordsize, ctx.config.wordsize) {}
+  explicit NullChunk(COFFLinkerContext &ctx, size_t n)
+      : NullChunk(n, ctx.config.wordsize) {}
   size_t getSize() const override { return size; }
 
   void writeTo(uint8_t *buf) const override {
@@ -388,6 +395,7 @@ class TailMergePDataChunkX64 : public NonSectionChunk {
   }
 
   size_t getSize() const override { return 3 * sizeof(uint32_t); }
+  MachineTypes getMachine() const override { return AMD64; }
 
   void writeTo(uint8_t *buf) const override {
     write32le(buf + 0, tm->getRVA()); // TailMergeChunk start RVA
@@ -408,6 +416,7 @@ class TailMergeUnwindInfoX64 : public NonSectionChunk {
   }
 
   size_t getSize() const override { return sizeof(tailMergeUnwindInfoX64); }
+  MachineTypes getMachine() const override { return AMD64; }
 
   void writeTo(uint8_t *buf) const override {
     memcpy(buf, tailMergeUnwindInfoX64, sizeof(tailMergeUnwindInfoX64));
@@ -737,11 +746,11 @@ void IdataContents::create(COFFLinkerContext &ctx) {
       }
     }
     // Terminate with null values.
-    lookups.push_back(make<NullChunk>(ctx.config.wordsize));
-    addresses.push_back(make<NullChunk>(ctx.config.wordsize));
+    lookups.push_back(make<NullChunk>(ctx));
+    addresses.push_back(make<NullChunk>(ctx));
     if (ctx.config.machine == ARM64EC) {
-      auxIat.push_back(make<NullChunk>(ctx.config.wordsize));
-      auxIatCopy.push_back(make<NullChunk>(ctx.config.wordsize));
+      auxIat.push_back(make<NullChunk>(ctx));
+      auxIatCopy.push_back(make<NullChunk>(ctx));
     }
 
     for (int i = 0, e = syms.size(); i < e; ++i)
@@ -755,7 +764,7 @@ void IdataContents::create(COFFLinkerContext &ctx) {
     dirs.push_back(dir);
   }
   // Add null terminator.
-  dirs.push_back(make<NullChunk>(sizeof(ImportDirectoryTableEntry)));
+  dirs.push_back(make<NullChunk>(sizeof(ImportDirectoryTableEntry), 4));
 }
 
 std::vector<Chunk *> DelayLoadContents::getChunks() {
@@ -830,17 +839,16 @@ void DelayLoadContents::create(Defined *h) {
         saver().save("__tailMerge_" + syms[0]->getDLLName().lower());
     ctx.symtab.addSynthetic(tmName, tm);
     // Terminate with null values.
-    addresses.push_back(make<NullChunk>(8));
-    names.push_back(make<NullChunk>(8));
+    addresses.push_back(make<NullChunk>(ctx, 8));
+    names.push_back(make<NullChunk>(ctx, 8));
     if (ctx.config.machine == ARM64EC) {
-      auxIat.push_back(make<NullChunk>(8));
-      auxIatCopy.push_back(make<NullChunk>(8));
+      auxIat.push_back(make<NullChunk>(ctx, 8));
+      auxIatCopy.push_back(make<NullChunk>(ctx, 8));
     }
 
     for (int i = 0, e = syms.size(); i < e; ++i)
       syms[i]->setLocation(addresses[base + i]);
-    auto *mh = make<NullChunk>(8);
-    mh->setAlignment(8);
+    auto *mh = make<NullChunk>(8, 8);
     moduleHandles.push_back(mh);
 
     // Fill the delay import table header fields.
@@ -853,7 +861,8 @@ void DelayLoadContents::create(Defined *h) {
   if (unwind)
     unwindinfo.push_back(unwind);
   // Add null terminator.
-  dirs.push_back(make<NullChunk>(sizeof(delay_import_directory_table_entry)));
+  dirs.push_back(
+      make<NullChunk>(sizeof(delay_import_directory_table_entry), 4));
 }
 
 Chunk *DelayLoadContents::newTailMergeChunk(Chunk *dir) {
@@ -875,6 +884,7 @@ Chunk *DelayLoadContents::newTailMergeChunk(Chunk *dir) {
 Chunk *DelayLoadContents::newTailMergeUnwindInfoChunk() {
   switch (ctx.config.machine) {
   case AMD64:
+  case ARM64EC:
     return make<TailMergeUnwindInfoX64>();
     // FIXME: Add support for other architectures.
   default:
@@ -884,6 +894,7 @@ Chunk *DelayLoadContents::newTailMergeUnwindInfoChunk() {
 Chunk *DelayLoadContents::newTailMergePDataChunk(Chunk *tm, Chunk *unwind) {
   switch (ctx.config.machine) {
   case AMD64:
+  case ARM64EC:
     return make<TailMergePDataChunkX64>(tm, unwind);
     // FIXME: Add support for other architectures.
   default:
diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 100db44544681..c23a2f872d918 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -49,15 +49,15 @@ class ARM final : public TargetInfo {
   void relocate(uint8_t *loc, const Relocation &rel,
                 uint64_t val) const override;
 
+  DenseMap<InputSection *, SmallVector<const Defined *, 0>> sectionMap;
+
 private:
-void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val,
-                           int group, bool check) const;
+  void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val,
+                      int group, bool check) const;
 };
 enum class CodeState { Data = 0, Thumb = 2, Arm = 4 };
 } // namespace
 
-static DenseMap<InputSection *, SmallVector<const Defined *, 0>> sectionMap{};
-
 ARM::ARM(Ctx &ctx) : TargetInfo(ctx) {
   copyRel = R_ARM_COPY;
   relativeRel = R_ARM_RELATIVE;
@@ -1047,10 +1047,10 @@ static bool isDataMapSymbol(const Symbol *b) {
   return b->getName() == "$d" || b->getName().starts_with("$d.");
 }
 
-void elf::sortArmMappingSymbols() {
+void elf::sortArmMappingSymbols(Ctx &ctx) {
   // For each input section make sure the mapping symbols are sorted in
   // ascending order.
-  for (auto &kv : sectionMap) {
+  for (auto &kv : static_cast<ARM &>(*ctx.target).sectionMap) {
     SmallVector<const Defined *, 0> &mapSyms = kv.second;
     llvm::stable_sort(mapSyms, [](const Defined *a, const Defined *b) {
       return a->value < b->value;
@@ -1063,6 +1063,7 @@ void elf::addArmInputSectionMappingSymbols(Ctx &ctx) {
   // The linker generated mapping symbols for all the synthetic
   // sections are adding into the sectionmap through the function
   // addArmSyntheitcSectionMappingSymbol.
+  auto &sectionMap = static_cast<ARM &>(*ctx.target).sectionMap;
   for (ELFFileBase *file : ctx.objectFiles) {
     for (Symbol *sym : file->getLocalSymbols()) {
       auto *def = dyn_cast<Defined>(sym);
@@ -1088,7 +1089,7 @@ void elf::addArmSyntheticSectionMappingSymbol(Defined *sym) {
     return;
   if (auto *sec = cast_if_present<InputSection>(sym->section))
     if (sec->flags & SHF_EXECINSTR)
-      sectionMap[sec].push_back(sym);
+      static_cast<ARM &>(*sec->file->ctx.target).sectionMap[sec].push_back(sym);
 }
 
 static void toLittleEndianInstructions(uint8_t *buf, uint64_t start,
@@ -1109,7 +1110,9 @@ static void toLittleEndianInstructions(uint8_t *buf, uint64_t start,
 // identify half open intervals of Arm code [$a, non $a) and Thumb code
 // [$t, non $t) and convert these to little endian a word or half word at a
 // time respectively.
-void elf::convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf) {
+void elf::convertArmInstructionstoBE8(Ctx &ctx, InputSection *sec,
+                                      uint8_t *buf) {
+  auto &sectionMap = static_cast<ARM &>(*ctx.target).sectionMap;
   auto it = sectionMap.find(sec);
   if (it == sectionMap.end())
     return;
diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp
index e7779683c9707..3c6fe0daee6bd 100644
--- a/lld/ELF/Arch/Hexagon.cpp
+++ b/lld/ELF/Arch/Hexagon.cpp
@@ -328,7 +328,7 @@ void Hexagon::relocate(uint8_t *loc, const Relocation &rel,
   case R_HEX_B22_PCREL:
   case R_HEX_GD_PLT_B22_PCREL:
   case R_HEX_PLT_B22_PCREL:
-    checkInt(ctx, loc, val, 22, rel);
+    checkInt(ctx, loc, val, 24, rel);
     or32le(loc, applyMask(0x1ff3ffe, val >> 2));
     break;
   case R_HEX_B22_PCREL_X:
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index e9267bd4128d1..2dcce5c224d5d 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -394,7 +394,7 @@ RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
   case R_X86_64_GOTPCREL:
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
   case R_X86_64_GOTTPOFF:
     return R_GOT_PC;
   case R_X86_64_GOTOFF64:
@@ -738,7 +738,7 @@ int64_t X86_64::getImplicitAddend(const uint8_t *buf, RelType type) const {
   case R_X86_64_GOTPCREL:
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
   case R_X86_64_PC32:
   case R_X86_64_GOTTPOFF:
   case R_X86_64_PLT32:
@@ -821,7 +821,7 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     break;
   case R_X86_64_GOTPCRELX:
   case R_X86_64_REX_GOTPCRELX:
-  case R_X86_64_REX2_GOTPCRELX:
+  case R_X86_64_CODE_4_GOTPCRELX:
     if (rel.expr != R_GOT_PC) {
       relaxGot(loc, rel, val);
     } else {
@@ -873,13 +873,13 @@ void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
 
 RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
                                 const uint8_t *loc) const {
-  // Only R_X86_64_[REX_]|[REX2_]GOTPCRELX can be relaxed. GNU as may emit
+  // Only R_X86_64_{,REX_,CODE_4_}GOTPCRELX can be relaxed. GNU as may emit
   // GOTPCRELX with addend != -4. Such an instruction does not load the full GOT
   // entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
   // %rax (addend=0) loads the high 32 bits of the GOT entry.
   if (!ctx.arg.relax || addend != -4 ||
       (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
-       type != R_X86_64_REX2_GOTPCRELX))
+       type != R_X86_64_CODE_4_GOTPCRELX))
     return R_GOT_PC;
   const uint8_t op = loc[-2];
   const uint8_t modRm = loc[-1];
@@ -1002,7 +1002,8 @@ static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) {
     // We are relaxing a rip relative to an absolute, so compensate
     // for the old -4 addend.
     assert(!rel.sym->file || !rel.sym->file->ctx.arg.isPic);
-    relaxGotNoPic(loc, val + 4, op, modRm, rel.type == R_X86_64_REX2_GOTPCRELX);
+    relaxGotNoPic(loc, val + 4, op, modRm,
+                  rel.type == R_X86_64_CODE_4_GOTPCRELX);
     return;
   }
 
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index 3f962e1a3129a..d8aa2c46cfa5b 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -1440,17 +1440,17 @@ void LinkerScript::adjustSectionsAfterSorting() {
 // We check if the headers fit below the first allocated section. If there isn't
 // enough space for these sections, we'll remove them from the PT_LOAD segment,
 // and we'll also remove the PT_PHDR segment.
-void LinkerScript::allocateHeaders(SmallVector<PhdrEntry *, 0> &phdrs) {
+void LinkerScript::allocateHeaders(
+    SmallVector<std::unique_ptr<PhdrEntry>, 0> &phdrs) {
   uint64_t min = std::numeric_limits<uint64_t>::max();
   for (OutputSection *sec : ctx.outputSections)
     if (sec->flags & SHF_ALLOC)
       min = std::min<uint64_t>(min, sec->addr);
 
-  auto it = llvm::find_if(
-      phdrs, [](const PhdrEntry *e) { return e->p_type == PT_LOAD; });
+  auto it = llvm::find_if(phdrs, [](auto &e) { return e->p_type == PT_LOAD; });
   if (it == phdrs.end())
     return;
-  PhdrEntry *firstPTLoad = *it;
+  PhdrEntry *firstPTLoad = it->get();
 
   bool hasExplicitHeaders =
       llvm::any_of(phdrsCommands, [](const PhdrsCommand &cmd) {
@@ -1479,8 +1479,7 @@ void LinkerScript::allocateHeaders(SmallVector<PhdrEntry *, 0> &phdrs) {
   ctx.out.programHeaders->ptLoad = nullptr;
   firstPTLoad->firstSec = findFirstSection(ctx, firstPTLoad);
 
-  llvm::erase_if(phdrs,
-                 [](const PhdrEntry *e) { return e->p_type == PT_PHDR; });
+  llvm::erase_if(phdrs, [](auto &e) { return e->p_type == PT_PHDR; });
 }
 
 LinkerScript::AddressState::AddressState(const LinkerScript &script) {
@@ -1643,13 +1642,14 @@ void LinkerScript::erasePotentialSpillSections() {
 }
 
 // Creates program headers as instructed by PHDRS linker script command.
-SmallVector<PhdrEntry *, 0> LinkerScript::createPhdrs() {
-  SmallVector<PhdrEntry *, 0> ret;
+SmallVector<std::unique_ptr<PhdrEntry>, 0> LinkerScript::createPhdrs() {
+  SmallVector<std::unique_ptr<PhdrEntry>, 0> ret;
 
   // Process PHDRS and FILEHDR keywords because they are not
   // real output sections and cannot be added in the following loop.
   for (const PhdrsCommand &cmd : phdrsCommands) {
-    PhdrEntry *phdr = make<PhdrEntry>(ctx, cmd.type, cmd.flags.value_or(PF_R));
+    auto phdr =
+        std::make_unique<PhdrEntry>(ctx, cmd.type, cmd.flags.value_or(PF_R));
 
     if (cmd.hasFilehdr)
       phdr->add(ctx.out.elfHeader.get());
@@ -1660,7 +1660,7 @@ SmallVector<PhdrEntry *, 0> LinkerScript::createPhdrs() {
       phdr->p_paddr = cmd.lmaExpr().getValue();
       phdr->hasLMA = true;
     }
-    ret.push_back(phdr);
+    ret.push_back(std::move(phdr));
   }
 
   // Add output sections to program headers.
diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h
index 445bb2b1d7d23..f5408b4ba3037 100644
--- a/lld/ELF/LinkerScript.h
+++ b/lld/ELF/LinkerScript.h
@@ -356,14 +356,14 @@ class LinkerScript final {
   void adjustOutputSections();
   void adjustSectionsAfterSorting();
 
-  SmallVector<PhdrEntry *, 0> createPhdrs();
+  SmallVector<std::unique_ptr<PhdrEntry>, 0> createPhdrs();
   bool needsInterpSection();
 
   bool shouldKeep(InputSectionBase *s);
   std::pair<const OutputSection *, const Defined *> assignAddresses();
   bool spillSections();
   void erasePotentialSpillSections();
-  void allocateHeaders(SmallVector<PhdrEntry *, 0> &phdrs);
+  void allocateHeaders(SmallVector<std::unique_ptr<PhdrEntry>, 0> &phdrs);
   void processSectionCommands();
   void processSymbolAssignments();
   void declareSymbols();
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 094524f9b5379..9bcbea250e7db 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -72,6 +72,10 @@ OutputSection::OutputSection(Ctx &ctx, StringRef name, uint32_t type,
                   /*info=*/0, /*link=*/0),
       ctx(ctx) {}
 
+uint64_t OutputSection::getLMA() const {
+  return ptLoad ? addr + ptLoad->lmaOffset : addr;
+}
+
 // We allow sections of types listed below to merged into a
 // single progbits section. This is typically done by linker
 // scripts. Merging nobits and progbits will force disk space
@@ -536,7 +540,7 @@ void OutputSection::writeTo(Ctx &ctx, uint8_t *buf, parallel::TaskGroup &tg) {
       // instructions to little-endian, leaving the data big-endian.
       if (ctx.arg.emachine == EM_ARM && !ctx.arg.isLE && ctx.arg.armBe8 &&
           (flags & SHF_EXECINSTR))
-        convertArmInstructionstoBE8(isec, buf + isec->outSecOff);
+        convertArmInstructionstoBE8(ctx, isec, buf + isec->outSecOff);
 
       // Fill gaps between sections.
       if (nonZeroFiller) {
diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
index f8509d5a7aaab..67191392d1dbe 100644
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -41,7 +41,7 @@ class OutputSection final : public SectionBase {
     return s->kind() == SectionBase::Output;
   }
 
-  uint64_t getLMA() const { return ptLoad ? addr + ptLoad->lmaOffset : addr; }
+  uint64_t getLMA() const;
   template <typename ELFT> void writeHeaderTo(typename ELFT::Shdr *sHdr);
 
   Ctx &ctx;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 1f3c5708f9253..e110adead5ad0 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1907,6 +1907,10 @@ static void forEachInputSectionDescription(
   }
 }
 
+ThunkCreator::ThunkCreator(Ctx &ctx) : ctx(ctx) {}
+
+ThunkCreator::~ThunkCreator() {}
+
 // Thunk Implementation
 //
 // Thunks (sometimes called stubs, veneers or branch islands) are small pieces
@@ -2212,7 +2216,7 @@ static bool isThunkSectionCompatible(InputSection *source,
 
 std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec,
                                                 Relocation &rel, uint64_t src) {
-  std::vector<Thunk *> *thunkVec = nullptr;
+  SmallVector<std::unique_ptr<Thunk>, 0> *thunkVec = nullptr;
   // Arm and Thumb have a PC Bias of 8 and 4 respectively, this is cancelled
   // out in the relocation addend. We compensate for the PC bias so that
   // an Arm and Thumb relocation to the same destination get the same keyAddend,
@@ -2233,17 +2237,16 @@ std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec,
     thunkVec = &thunkedSymbols[{rel.sym, keyAddend}];
 
   // Check existing Thunks for Sym to see if they can be reused
-  for (Thunk *t : *thunkVec)
+  for (auto &t : *thunkVec)
     if (isThunkSectionCompatible(isec, t->getThunkTargetSym()->section) &&
         t->isCompatibleWith(*isec, rel) &&
         ctx.target->inBranchRange(rel.type, src,
                                   t->getThunkTargetSym()->getVA(ctx, -pcBias)))
-      return std::make_pair(t, false);
+      return std::make_pair(t.get(), false);
 
   // No existing compatible Thunk in range, create a new one
-  Thunk *t = addThunk(ctx, *isec, rel);
-  thunkVec->push_back(t);
-  return std::make_pair(t, true);
+  thunkVec->push_back(addThunk(ctx, *isec, rel));
+  return std::make_pair(thunkVec->back().get(), true);
 }
 
 std::pair<Thunk *, bool> ThunkCreator::getSyntheticLandingPad(Defined &d,
@@ -2252,7 +2255,7 @@ std::pair<Thunk *, bool> ThunkCreator::getSyntheticLandingPad(Defined &d,
       {{d.section, d.value}, a}, nullptr);
   if (isNew)
     it->second = addLandingPadThunk(ctx, d, a);
-  return {it->second, isNew};
+  return {it->second.get(), isNew};
 }
 
 // Return true if the relocation target is an in range Thunk.
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index f800b5db61e97..041bd48048587 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -163,7 +163,9 @@ class InputSectionDescription;
 
 class ThunkCreator {
 public:
-  ThunkCreator(Ctx &ctx) : ctx(ctx) {}
+  // Thunk may be incomplete. Avoid inline ctor/dtor.
+  ThunkCreator(Ctx &ctx);
+  ~ThunkCreator();
   // Return true if Thunks have been added to OutputSections
   bool createThunks(uint32_t pass, ArrayRef<OutputSection *> outputSections);
 
@@ -199,9 +201,10 @@ class ThunkCreator {
   // original addend, so we cannot fold offset + addend. A nested pair is used
   // because DenseMapInfo is not specialized for std::tuple.
   llvm::DenseMap<std::pair<std::pair<SectionBase *, uint64_t>, int64_t>,
-                 std::vector<Thunk *>>
+                 SmallVector<std::unique_ptr<Thunk>, 0>>
       thunkedSymbolsBySectionAndAddend;
-  llvm::DenseMap<std::pair<Symbol *, int64_t>, std::vector<Thunk *>>
+  llvm::DenseMap<std::pair<Symbol *, int64_t>,
+                 SmallVector<std::unique_ptr<Thunk>, 0>>
       thunkedSymbols;
 
   // Find a Thunk from the Thunks symbol definition, we can use this to find
@@ -220,7 +223,7 @@ class ThunkCreator {
   // to be reached via thunks that use indirect branches. A destination
   // needs at most one landing pad as that can be reused by all callers.
   llvm::DenseMap<std::pair<std::pair<SectionBase *, uint64_t>, int64_t>,
-                 Thunk *>
+                 std::unique_ptr<Thunk>>
       landingPadsBySectionAndAddend;
 
   // All the nonLandingPad thunks that have been created, in order of creation.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 10372248bc873..7e5e713513c47 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -4427,7 +4427,7 @@ void elf::writeEhdr(Ctx &ctx, uint8_t *buf, Partition &part) {
 template <typename ELFT> void elf::writePhdrs(uint8_t *buf, Partition &part) {
   // Write the program header table.
   auto *hBuf = reinterpret_cast<typename ELFT::Phdr *>(buf);
-  for (PhdrEntry *p : part.phdrs) {
+  for (std::unique_ptr<PhdrEntry> &p : part.phdrs) {
     hBuf->p_type = p->p_type;
     hBuf->p_flags = p->p_flags;
     hBuf->p_offset = p->p_offset;
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 9f78bd3a34834..163a4950a0983 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -1439,6 +1439,31 @@ Defined *addSyntheticLocal(Ctx &ctx, StringRef name, uint8_t type,
 
 void addVerneed(Ctx &, Symbol &ss);
 
+// Describes one program header entry: its type, its access flags, and the
+// range of output sections (firstSec through lastSec) that will be placed
+// in it.
+struct PhdrEntry {
+  PhdrEntry(Ctx &ctx, unsigned type, unsigned flags)
+      : p_align(type == llvm::ELF::PT_LOAD ? ctx.arg.maxPageSize : 0),
+        p_type(type), p_flags(flags) {}
+  void add(OutputSection *sec);
+
+  uint64_t p_paddr = 0;
+  uint64_t p_vaddr = 0;
+  uint64_t p_memsz = 0;
+  uint64_t p_filesz = 0;
+  uint64_t p_offset = 0;
+  uint32_t p_align = 0;
+  uint32_t p_type = 0;
+  uint32_t p_flags = 0;
+
+  OutputSection *firstSec = nullptr;
+  OutputSection *lastSec = nullptr;
+  bool hasLMA = false;
+
+  uint64_t lmaOffset = 0;
+};
+
 // Linker generated per-partition sections.
 struct Partition {
   Ctx &ctx;
@@ -1447,7 +1472,7 @@ struct Partition {
 
   std::unique_ptr<SyntheticSection> elfHeader;
   std::unique_ptr<SyntheticSection> programHeaders;
-  SmallVector<PhdrEntry *, 0> phdrs;
+  SmallVector<std::unique_ptr<PhdrEntry>, 0> phdrs;
 
   std::unique_ptr<ARMExidxSyntheticSection> armExidx;
   std::unique_ptr<BuildIdSection> buildId;
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index 2277537a4e357..ce42d3624a8f5 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -246,8 +246,8 @@ void riscvFinalizeRelax(int passes);
 void mergeRISCVAttributesSections(Ctx &);
 void addArmInputSectionMappingSymbols(Ctx &);
 void addArmSyntheticSectionMappingSymbol(Defined *);
-void sortArmMappingSymbols();
-void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf);
+void sortArmMappingSymbols(Ctx &);
+void convertArmInstructionstoBE8(Ctx &, InputSection *sec, uint8_t *buf);
 void createTaggedSymbols(Ctx &);
 void initSymbolAnchors(Ctx &);
 
diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp
index c9625453b2eb1..4f04c33f0e5c5 100644
--- a/lld/ELF/Thunks.cpp
+++ b/lld/ELF/Thunks.cpp
@@ -1372,15 +1372,16 @@ Thunk::Thunk(Ctx &ctx, Symbol &d, int64_t a)
 
 Thunk::~Thunk() = default;
 
-static Thunk *addThunkAArch64(Ctx &ctx, RelType type, Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkAArch64(Ctx &ctx, RelType type, Symbol &s,
+                                              int64_t a) {
   assert(is_contained({R_AARCH64_CALL26, R_AARCH64_JUMP26, R_AARCH64_PLT32},
                       type));
   bool mayNeedLandingPad =
       (ctx.arg.andFeatures & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) &&
       !isAArch64BTILandingPad(ctx, s, a);
   if (ctx.arg.picThunk)
-    return make<AArch64ADRPThunk>(ctx, s, a, mayNeedLandingPad);
-  return make<AArch64ABSLongThunk>(ctx, s, a, mayNeedLandingPad);
+    return std::make_unique<AArch64ADRPThunk>(ctx, s, a, mayNeedLandingPad);
+  return std::make_unique<AArch64ABSLongThunk>(ctx, s, a, mayNeedLandingPad);
 }
 
 // Creates a thunk for long branches or Thumb-ARM interworking.
@@ -1391,7 +1392,8 @@ static Thunk *addThunkAArch64(Ctx &ctx, RelType type, Symbol &s, int64_t a) {
 //
 // TODO: use B for short Thumb->Arm thunks instead of LDR (this doesn't work for
 //       Arm->Thumb, as in Arm state no BX PC trick; it doesn't switch state).
-static Thunk *addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s,
+                                            int64_t a) {
   bool thumb_target = s.getVA(ctx, a) & 1;
 
   switch (reloc) {
@@ -1401,21 +1403,21 @@ static Thunk *addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
   case R_ARM_CALL:
     if (ctx.arg.picThunk) {
       if (thumb_target)
-        return make<ARMV4PILongBXThunk>(ctx, s, a);
-      return make<ARMV4PILongThunk>(ctx, s, a);
+        return std::make_unique<ARMV4PILongBXThunk>(ctx, s, a);
+      return std::make_unique<ARMV4PILongThunk>(ctx, s, a);
     }
     if (thumb_target)
-      return make<ARMV4ABSLongBXThunk>(ctx, s, a);
-    return make<ARMV5LongLdrPcThunk>(ctx, s, a);
+      return std::make_unique<ARMV4ABSLongBXThunk>(ctx, s, a);
+    return std::make_unique<ARMV5LongLdrPcThunk>(ctx, s, a);
   case R_ARM_THM_CALL:
     if (ctx.arg.picThunk) {
       if (thumb_target)
-        return make<ThumbV4PILongThunk>(ctx, s, a);
-      return make<ThumbV4PILongBXThunk>(ctx, s, a);
+        return std::make_unique<ThumbV4PILongThunk>(ctx, s, a);
+      return std::make_unique<ThumbV4PILongBXThunk>(ctx, s, a);
     }
     if (thumb_target)
-      return make<ThumbV4ABSLongThunk>(ctx, s, a);
-    return make<ThumbV4ABSLongBXThunk>(ctx, s, a);
+      return std::make_unique<ThumbV4ABSLongThunk>(ctx, s, a);
+    return std::make_unique<ThumbV4ABSLongBXThunk>(ctx, s, a);
   }
   Fatal(ctx) << "relocation " << reloc << " to " << &s
              << " not supported for Armv4 or Armv4T target";
@@ -1427,7 +1429,8 @@ static Thunk *addThunkArmv4(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
 // - MOVT and MOVW instructions cannot be used
 // - Only Thumb relocation that can generate a Thunk is a BL, this can always
 //   be transformed into a BLX
-static Thunk *addThunkArmv5v6(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkArmv5v6(Ctx &ctx, RelType reloc,
+                                              Symbol &s, int64_t a) {
   switch (reloc) {
   case R_ARM_PC24:
   case R_ARM_PLT32:
@@ -1435,8 +1438,8 @@ static Thunk *addThunkArmv5v6(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
   case R_ARM_CALL:
   case R_ARM_THM_CALL:
     if (ctx.arg.picThunk)
-      return make<ARMV4PILongBXThunk>(ctx, s, a);
-    return make<ARMV5LongLdrPcThunk>(ctx, s, a);
+      return std::make_unique<ARMV4PILongBXThunk>(ctx, s, a);
+    return std::make_unique<ARMV5LongLdrPcThunk>(ctx, s, a);
   }
   Fatal(ctx) << "relocation " << reloc << " to " << &s
              << " not supported for Armv5 or Armv6 targets";
@@ -1448,8 +1451,8 @@ static Thunk *addThunkArmv5v6(Ctx &ctx, RelType reloc, Symbol &s, int64_t a) {
 // - MOVT and MOVW instructions cannot be used.
 // - Only a limited number of instructions can access registers r8 and above
 // - No interworking support is needed (all Thumb).
-static Thunk *addThunkV6M(Ctx &ctx, const InputSection &isec, RelType reloc,
-                          Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkV6M(Ctx &ctx, const InputSection &isec,
+                                          RelType reloc, Symbol &s, int64_t a) {
   const bool isPureCode = isec.getParent()->flags & SHF_ARM_PURECODE;
   switch (reloc) {
   case R_ARM_THM_JUMP19:
@@ -1457,7 +1460,7 @@ static Thunk *addThunkV6M(Ctx &ctx, const InputSection &isec, RelType reloc,
   case R_ARM_THM_CALL:
     if (ctx.arg.isPic) {
       if (!isPureCode)
-        return make<ThumbV6MPILongThunk>(ctx, s, a);
+        return std::make_unique<ThumbV6MPILongThunk>(ctx, s, a);
 
       Fatal(ctx)
           << "relocation " << reloc << " to " << &s
@@ -1466,8 +1469,8 @@ static Thunk *addThunkV6M(Ctx &ctx, const InputSection &isec, RelType reloc,
       llvm_unreachable("");
     }
     if (isPureCode)
-      return make<ThumbV6MABSXOLongThunk>(ctx, s, a);
-    return make<ThumbV6MABSLongThunk>(ctx, s, a);
+      return std::make_unique<ThumbV6MABSXOLongThunk>(ctx, s, a);
+    return std::make_unique<ThumbV6MABSLongThunk>(ctx, s, a);
   }
   Fatal(ctx) << "relocation " << reloc << " to " << &s
              << " not supported for Armv6-M targets";
@@ -1475,8 +1478,8 @@ static Thunk *addThunkV6M(Ctx &ctx, const InputSection &isec, RelType reloc,
 }
 
 // Creates a thunk for Thumb-ARM interworking or branch range extension.
-static Thunk *addThunkArm(Ctx &ctx, const InputSection &isec, RelType reloc,
-                          Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkArm(Ctx &ctx, const InputSection &isec,
+                                          RelType reloc, Symbol &s, int64_t a) {
   // Decide which Thunk is needed based on:
   // Available instruction set
   // - An Arm Thunk can only be used if Arm state is available.
@@ -1508,47 +1511,49 @@ static Thunk *addThunkArm(Ctx &ctx, const InputSection &isec, RelType reloc,
   case R_ARM_JUMP24:
   case R_ARM_CALL:
     if (ctx.arg.picThunk)
-      return make<ARMV7PILongThunk>(ctx, s, a);
-    return make<ARMV7ABSLongThunk>(ctx, s, a);
+      return std::make_unique<ARMV7PILongThunk>(ctx, s, a);
+    return std::make_unique<ARMV7ABSLongThunk>(ctx, s, a);
   case R_ARM_THM_JUMP19:
   case R_ARM_THM_JUMP24:
   case R_ARM_THM_CALL:
     if (ctx.arg.picThunk)
-      return make<ThumbV7PILongThunk>(ctx, s, a);
-    return make<ThumbV7ABSLongThunk>(ctx, s, a);
+      return std::make_unique<ThumbV7PILongThunk>(ctx, s, a);
+    return std::make_unique<ThumbV7ABSLongThunk>(ctx, s, a);
   }
   llvm_unreachable("");
 }
 
-static Thunk *addThunkAVR(Ctx &ctx, RelType type, Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkAVR(Ctx &ctx, RelType type, Symbol &s,
+                                          int64_t a) {
   switch (type) {
   case R_AVR_LO8_LDI_GS:
   case R_AVR_HI8_LDI_GS:
-    return make<AVRThunk>(ctx, s, a);
+    return std::make_unique<AVRThunk>(ctx, s, a);
   default:
     llvm_unreachable("");
   }
 }
 
-static Thunk *addThunkMips(Ctx &ctx, RelType type, Symbol &s) {
+static std::unique_ptr<Thunk> addThunkMips(Ctx &ctx, RelType type, Symbol &s) {
   if ((s.stOther & STO_MIPS_MICROMIPS) && isMipsR6(ctx))
-    return make<MicroMipsR6Thunk>(ctx, s);
+    return std::make_unique<MicroMipsR6Thunk>(ctx, s);
   if (s.stOther & STO_MIPS_MICROMIPS)
-    return make<MicroMipsThunk>(ctx, s);
-  return make<MipsThunk>(ctx, s);
+    return std::make_unique<MicroMipsThunk>(ctx, s);
+  return std::make_unique<MipsThunk>(ctx, s);
 }
 
-static Thunk *addThunkPPC32(Ctx &ctx, const InputSection &isec,
-                            const Relocation &rel, Symbol &s) {
+static std::unique_ptr<Thunk> addThunkPPC32(Ctx &ctx, const InputSection &isec,
+                                            const Relocation &rel, Symbol &s) {
   assert((rel.type == R_PPC_LOCAL24PC || rel.type == R_PPC_REL24 ||
           rel.type == R_PPC_PLTREL24) &&
          "unexpected relocation type for thunk");
   if (s.isInPlt(ctx))
-    return make<PPC32PltCallStub>(ctx, isec, rel, s);
-  return make<PPC32LongThunk>(ctx, s, rel.addend);
+    return std::make_unique<PPC32PltCallStub>(ctx, isec, rel, s);
+  return std::make_unique<PPC32LongThunk>(ctx, s, rel.addend);
 }
 
-static Thunk *addThunkPPC64(Ctx &ctx, RelType type, Symbol &s, int64_t a) {
+static std::unique_ptr<Thunk> addThunkPPC64(Ctx &ctx, RelType type, Symbol &s,
+                                            int64_t a) {
   assert((type == R_PPC64_REL14 || type == R_PPC64_REL24 ||
           type == R_PPC64_REL24_NOTOC) &&
          "unexpected relocation type for thunk");
@@ -1558,27 +1563,30 @@ static Thunk *addThunkPPC64(Ctx &ctx, RelType type, Symbol &s, int64_t a) {
   if (type == R_PPC64_REL24_NOTOC)
     ctx.target->ppc64DynamicSectionOpt = 0x2;
 
-  if (s.isInPlt(ctx))
-    return type == R_PPC64_REL24_NOTOC
-               ? (Thunk *)make<PPC64R12SetupStub>(ctx, s, /*gotPlt=*/true)
-               : (Thunk *)make<PPC64PltCallStub>(ctx, s);
+  if (s.isInPlt(ctx)) {
+    if (type == R_PPC64_REL24_NOTOC)
+      return std::make_unique<PPC64R12SetupStub>(ctx, s,
+                                                 /*gotPlt=*/true);
+    return std::make_unique<PPC64PltCallStub>(ctx, s);
+  }
 
   // This check looks at the st_other bits of the callee. If the value is 1
   // then the callee clobbers the TOC and we need an R2 save stub when RelType
   // is R_PPC64_REL14 or R_PPC64_REL24.
   if ((type == R_PPC64_REL14 || type == R_PPC64_REL24) && (s.stOther >> 5) == 1)
-    return make<PPC64R2SaveStub>(ctx, s, a);
+    return std::make_unique<PPC64R2SaveStub>(ctx, s, a);
 
   if (type == R_PPC64_REL24_NOTOC)
-    return make<PPC64R12SetupStub>(ctx, s, /*gotPlt=*/false);
+    return std::make_unique<PPC64R12SetupStub>(ctx, s, /*gotPlt=*/false);
 
   if (ctx.arg.picThunk)
-    return make<PPC64PILongBranchThunk>(ctx, s, a);
+    return std::make_unique<PPC64PILongBranchThunk>(ctx, s, a);
 
-  return make<PPC64PDLongBranchThunk>(ctx, s, a);
+  return std::make_unique<PPC64PDLongBranchThunk>(ctx, s, a);
 }
 
-Thunk *elf::addThunk(Ctx &ctx, const InputSection &isec, Relocation &rel) {
+std::unique_ptr<Thunk> elf::addThunk(Ctx &ctx, const InputSection &isec,
+                                     Relocation &rel) {
   Symbol &s = *rel.sym;
   int64_t a = rel.addend;
 
@@ -1600,10 +1608,10 @@ Thunk *elf::addThunk(Ctx &ctx, const InputSection &isec, Relocation &rel) {
   }
 }
 
-Thunk *elf::addLandingPadThunk(Ctx &ctx, Symbol &s, int64_t a) {
+std::unique_ptr<Thunk> elf::addLandingPadThunk(Ctx &ctx, Symbol &s, int64_t a) {
   switch (ctx.arg.emachine) {
   case EM_AARCH64:
-    return make<AArch64BTILandingPadThunk>(ctx, s, a);
+    return std::make_unique<AArch64BTILandingPadThunk>(ctx, s, a);
   default:
     llvm_unreachable("add landing pad only supported for AArch64");
   }
diff --git a/lld/ELF/Thunks.h b/lld/ELF/Thunks.h
index 247b500580325..446345b8517f9 100644
--- a/lld/ELF/Thunks.h
+++ b/lld/ELF/Thunks.h
@@ -76,11 +76,12 @@ class Thunk {
 
 // For a Relocation to symbol S create a Thunk to be added to a synthetic
 // ThunkSection.
-Thunk *addThunk(Ctx &, const InputSection &isec, Relocation &rel);
+std::unique_ptr<Thunk> addThunk(Ctx &, const InputSection &isec,
+                                Relocation &rel);
 
 // Create a landing pad Thunk for use when indirect branches from Thunks
 // are restricted.
-Thunk *addLandingPadThunk(Ctx &, Symbol &s, int64_t a);
+std::unique_ptr<Thunk> addLandingPadThunk(Ctx &, Symbol &s, int64_t a);
 
 void writePPC32PltCallStub(Ctx &, uint8_t *buf, uint64_t gotPltVA,
                            const InputFile *file, int64_t addend);
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 5865ead0ff88b..67497bad7cb23 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -49,7 +49,7 @@ template <class ELFT> class Writer {
 public:
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
 
-  Writer(Ctx &ctx) : ctx(ctx), buffer(ctx.e.outputBuffer) {}
+  Writer(Ctx &ctx) : ctx(ctx), buffer(ctx.e.outputBuffer), tc(ctx) {}
 
   void run();
 
@@ -65,7 +65,7 @@ template <class ELFT> class Writer {
   void checkExecuteOnly();
   void setReservedSymbolSections();
 
-  SmallVector<PhdrEntry *, 0> createPhdrs(Partition &part);
+  SmallVector<std::unique_ptr<PhdrEntry>, 0> createPhdrs(Partition &part);
   void addPhdrForSection(Partition &part, unsigned shType, unsigned pType,
                          unsigned pFlags);
   void assignFileOffsets();
@@ -82,6 +82,8 @@ template <class ELFT> class Writer {
 
   Ctx &ctx;
   std::unique_ptr<FileOutputBuffer> &buffer;
+  // ThunkCreator holds Thunks that are used at writeTo time.
+  ThunkCreator tc;
 
   void addRelIpltSymbols();
   void addStartEndSymbols();
@@ -96,20 +98,22 @@ template <class ELFT> void elf::writeResult(Ctx &ctx) {
   Writer<ELFT>(ctx).run();
 }
 
-static void removeEmptyPTLoad(Ctx &ctx, SmallVector<PhdrEntry *, 0> &phdrs) {
-  auto it = std::stable_partition(
-      phdrs.begin(), phdrs.end(), [&](const PhdrEntry *p) {
-        if (p->p_type != PT_LOAD)
-          return true;
-        if (!p->firstSec)
-          return false;
-        uint64_t size = p->lastSec->addr + p->lastSec->size - p->firstSec->addr;
-        return size != 0;
-      });
+static void
+removeEmptyPTLoad(Ctx &ctx, SmallVector<std::unique_ptr<PhdrEntry>, 0> &phdrs) {
+  auto it = std::stable_partition(phdrs.begin(), phdrs.end(), [&](auto &p) {
+    if (p->p_type != PT_LOAD)
+      return true;
+    if (!p->firstSec)
+      return false;
+    uint64_t size = p->lastSec->addr + p->lastSec->size - p->firstSec->addr;
+    return size != 0;
+  });
 
   // Clear OutputSection::ptLoad for sections contained in removed
   // segments.
-  DenseSet<PhdrEntry *> removed(it, phdrs.end());
+  DenseSet<PhdrEntry *> removed;
+  for (auto it2 = it; it2 != phdrs.end(); ++it2)
+    removed.insert(it2->get());
   for (OutputSection *sec : ctx.outputSections)
     if (removed.count(sec->ptLoad))
       sec->ptLoad = nullptr;
@@ -853,10 +857,10 @@ template <class ELFT> void Writer<ELFT>::setReservedSymbolSections() {
     return ctx.arg.emachine == EM_X86_64 && osec->flags & SHF_X86_64_LARGE;
   };
   for (Partition &part : ctx.partitions) {
-    for (PhdrEntry *p : part.phdrs) {
+    for (auto &p : part.phdrs) {
       if (p->p_type != PT_LOAD)
         continue;
-      last = p;
+      last = p.get();
       if (!(p->p_flags & PF_W) && p->lastSec && !isLarge(p->lastSec))
         lastRO = p->lastSec;
     }
@@ -1446,7 +1450,6 @@ static void finalizeSynthetic(Ctx &ctx, SyntheticSection *sec) {
 // in Writer<ELFT>::finalizeSections().
 template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
   llvm::TimeTraceScope timeScope("Finalize address dependent content");
-  ThunkCreator tc(ctx);
   AArch64Err843419Patcher a64p(ctx);
   ARMErr657417Patcher a32p(ctx);
   ctx.script->assignAddresses();
@@ -1978,9 +1981,9 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
     // Android relocation packing can look up TLS symbol addresses. We only need
     // to care about the main partition here because all TLS symbols were moved
     // to the main partition (see MarkLive.cpp).
-    for (PhdrEntry *p : ctx.mainPart->phdrs)
+    for (auto &p : ctx.mainPart->phdrs)
       if (p->p_type == PT_TLS)
-        ctx.tlsPhdr = p;
+        ctx.tlsPhdr = p.get();
   }
 
   // Some symbols are defined in term of program headers. Now that we
@@ -2095,7 +2098,7 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
 
   if (ctx.arg.emachine == EM_ARM && !ctx.arg.isLE && ctx.arg.armBe8) {
     addArmInputSectionMappingSymbols(ctx);
-    sortArmMappingSymbols();
+    sortArmMappingSymbols(ctx);
   }
 }
 
@@ -2190,11 +2193,12 @@ static uint64_t computeFlags(Ctx &ctx, uint64_t flags) {
 // Decide which program headers to create and which sections to include in each
 // one.
 template <class ELFT>
-SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
-  SmallVector<PhdrEntry *, 0> ret;
+SmallVector<std::unique_ptr<PhdrEntry>, 0>
+Writer<ELFT>::createPhdrs(Partition &part) {
+  SmallVector<std::unique_ptr<PhdrEntry>, 0> ret;
   auto addHdr = [&, &ctx = ctx](unsigned type, unsigned flags) -> PhdrEntry * {
-    ret.push_back(make<PhdrEntry>(ctx, type, flags));
-    return ret.back();
+    ret.push_back(std::make_unique<PhdrEntry>(ctx, type, flags));
+    return ret.back().get();
   };
 
   unsigned partNo = part.getNumber(ctx);
@@ -2232,7 +2236,7 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
   // read-only by dynamic linker after processing relocations.
   // Current dynamic loaders only support one PT_GNU_RELRO PHDR, give
   // an error message if more than one PT_GNU_RELRO PHDR is required.
-  PhdrEntry *relRo = make<PhdrEntry>(ctx, PT_GNU_RELRO, PF_R);
+  auto relRo = std::make_unique<PhdrEntry>(ctx, PT_GNU_RELRO, PF_R);
   bool inRelroPhdr = false;
   OutputSection *relroEnd = nullptr;
   for (OutputSection *sec : ctx.outputSections) {
@@ -2309,19 +2313,19 @@ SmallVector<PhdrEntry *, 0> Writer<ELFT>::createPhdrs(Partition &part) {
   }
 
   // Add a TLS segment if any.
-  PhdrEntry *tlsHdr = make<PhdrEntry>(ctx, PT_TLS, PF_R);
+  auto tlsHdr = std::make_unique<PhdrEntry>(ctx, PT_TLS, PF_R);
   for (OutputSection *sec : ctx.outputSections)
     if (sec->partition == partNo && sec->flags & SHF_TLS)
       tlsHdr->add(sec);
   if (tlsHdr->firstSec)
-    ret.push_back(tlsHdr);
+    ret.push_back(std::move(tlsHdr));
 
   // Add an entry for .dynamic.
   if (OutputSection *sec = part.dynamic->getParent())
     addHdr(PT_DYNAMIC, sec->getPhdrFlags())->add(sec);
 
   if (relRo->firstSec)
-    ret.push_back(relRo);
+    ret.push_back(std::move(relRo));
 
   // PT_GNU_EH_FRAME is a special section pointing on .eh_frame_hdr.
   if (part.ehFrame->isNeeded() && part.ehFrameHdr &&
@@ -2394,9 +2398,9 @@ void Writer<ELFT>::addPhdrForSection(Partition &part, unsigned shType,
   if (i == ctx.outputSections.end())
     return;
 
-  PhdrEntry *entry = make<PhdrEntry>(ctx, pType, pFlags);
+  auto entry = std::make_unique<PhdrEntry>(ctx, pType, pFlags);
   entry->add(*i);
-  part.phdrs.push_back(entry);
+  part.phdrs.push_back(std::move(entry));
 }
 
 // Place the first section of each PT_LOAD to a different page (of maxPageSize).
@@ -2459,10 +2463,10 @@ template <class ELFT> void Writer<ELFT>::fixSectionAlignments() {
 
   for (Partition &part : ctx.partitions) {
     prev = nullptr;
-    for (const PhdrEntry *p : part.phdrs)
+    for (auto &p : part.phdrs)
       if (p->p_type == PT_LOAD && p->firstSec) {
-        pageAlign(p);
-        prev = p;
+        pageAlign(p.get());
+        prev = p.get();
       }
   }
 }
@@ -2524,9 +2528,9 @@ template <class ELFT> void Writer<ELFT>::assignFileOffsets() {
 
   PhdrEntry *lastRX = nullptr;
   for (Partition &part : ctx.partitions)
-    for (PhdrEntry *p : part.phdrs)
+    for (auto &p : part.phdrs)
       if (p->p_type == PT_LOAD && (p->p_flags & PF_X))
-        lastRX = p;
+        lastRX = p.get();
 
   // Layout SHF_ALLOC sections before non-SHF_ALLOC sections. A non-SHF_ALLOC
   // will not occupy file offsets contained by a PT_LOAD.
@@ -2579,7 +2583,7 @@ template <class ELFT> void Writer<ELFT>::assignFileOffsets() {
 // Finalize the program headers. We call this function after we assign
 // file offsets and VAs to all sections.
 template <class ELFT> void Writer<ELFT>::setPhdrs(Partition &part) {
-  for (PhdrEntry *p : part.phdrs) {
+  for (std::unique_ptr<PhdrEntry> &p : part.phdrs) {
     OutputSection *first = p->firstSec;
     OutputSection *last = p->lastSec;
 
@@ -2838,7 +2842,7 @@ static void fillTrap(std::array<uint8_t, 4> trapInstr, uint8_t *i,
 template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
   for (Partition &part : ctx.partitions) {
     // Fill the last page.
-    for (PhdrEntry *p : part.phdrs)
+    for (std::unique_ptr<PhdrEntry> &p : part.phdrs)
       if (p->p_type == PT_LOAD && (p->p_flags & PF_X))
         fillTrap(
             ctx.target->trapInstr,
@@ -2850,9 +2854,9 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
     // an executable segment to ensure that other tools don't accidentally
     // trim the instruction padding (e.g. when stripping the file).
     PhdrEntry *last = nullptr;
-    for (PhdrEntry *p : part.phdrs)
+    for (std::unique_ptr<PhdrEntry> &p : part.phdrs)
       if (p->p_type == PT_LOAD)
-        last = p;
+        last = p.get();
 
     if (last && (last->p_flags & PF_X))
       last->p_memsz = last->p_filesz =
diff --git a/lld/ELF/Writer.h b/lld/ELF/Writer.h
index bd6efe9cde4a5..b1072f61f7250 100644
--- a/lld/ELF/Writer.h
+++ b/lld/ELF/Writer.h
@@ -10,40 +10,12 @@
 #define LLD_ELF_WRITER_H
 
 #include "Config.h"
-#include "llvm/ADT/StringRef.h"
-#include <cstdint>
 
 namespace lld::elf {
-class InputFile;
 class OutputSection;
 void copySectionsIntoPartitions(Ctx &ctx);
 template <class ELFT> void writeResult(Ctx &ctx);
 
-// This describes a program header entry.
-// Each contains type, access flags and range of output sections that will be
-// placed in it.
-struct PhdrEntry {
-  PhdrEntry(Ctx &ctx, unsigned type, unsigned flags)
-      : p_align(type == llvm::ELF::PT_LOAD ? ctx.arg.maxPageSize : 0),
-        p_type(type), p_flags(flags) {}
-  void add(OutputSection *sec);
-
-  uint64_t p_paddr = 0;
-  uint64_t p_vaddr = 0;
-  uint64_t p_memsz = 0;
-  uint64_t p_filesz = 0;
-  uint64_t p_offset = 0;
-  uint32_t p_align = 0;
-  uint32_t p_type = 0;
-  uint32_t p_flags = 0;
-
-  OutputSection *firstSec = nullptr;
-  OutputSection *lastSec = nullptr;
-  bool hasLMA = false;
-
-  uint64_t lmaOffset = 0;
-};
-
 void addReservedSymbols(Ctx &ctx);
 bool includeInSymtab(Ctx &, const Symbol &);
 unsigned getSectionRank(Ctx &, OutputSection &osec);
diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index 195a8f09f47c1..882873ae5de0c 100644
--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -44,6 +44,7 @@ struct ARM64 : ARM64Common {
 
   void initICFSafeThunkBody(InputSection *thunk,
                             InputSection *branchTarget) const override;
+  InputSection *getThunkBranchTarget(InputSection *thunk) const override;
   uint32_t getICFSafeThunkSize() const override;
 };
 
@@ -197,6 +198,16 @@ void ARM64::initICFSafeThunkBody(InputSection *thunk,
                              /*referent=*/branchTarget);
 }
 
+InputSection *ARM64::getThunkBranchTarget(InputSection *thunk) const {
+  assert(thunk->relocs.size() == 1 &&
+         "expected a single reloc on ARM64 ICF thunk");
+  auto &reloc = thunk->relocs[0];
+  assert(reloc.referent.is<InputSection *>() &&
+         "ARM64 thunk reloc is expected to point to an InputSection");
+  // The kind is asserted above, so use the asserting get<> over dyn_cast<>.
+  return reloc.referent.get<InputSection *>();
+}
+
 uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
 
 ARM64::ARM64() : ARM64Common(LP64()) {
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 8f6da6330d7ad..41bcd58acc27f 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -164,6 +164,7 @@ struct Configuration {
   llvm::StringRef finalOutput;
 
   llvm::StringRef installName;
+  llvm::StringRef clientName;
   llvm::StringRef mapFile;
   llvm::StringRef ltoObjPath;
   llvm::StringRef thinLTOJobs;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index be0ee7ad8dff9..53b4372435ab5 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1881,6 +1881,15 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     config->installName = config->finalOutput;
   }
 
+  auto getClientName = [&]() {
+    StringRef cn = path::filename(config->finalOutput);
+    cn.consume_front("lib");
+    auto firstDotOrUnderscore = cn.find_first_of("._");
+    cn = cn.take_front(firstDotOrUnderscore);
+    return cn;
+  };
+  config->clientName = args.getLastArgValue(OPT_client_name, getClientName());
+
   if (args.hasArg(OPT_mark_dead_strippable_dylib)) {
     if (config->outputType != MH_DYLIB)
       warn("-mark_dead_strippable_dylib: ignored, only has effect with -dylib");
diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp
index 858a4bb34029c..308c5eaf8c317 100644
--- a/lld/MachO/DriverUtils.cpp
+++ b/lld/MachO/DriverUtils.cpp
@@ -268,6 +268,26 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
     if (newFile->exportingFile)
       newFile->parseLoadCommands(mbref);
   }
+
+  if (explicitlyLinked && !newFile->allowableClients.empty()) {
+    bool allowed = std::any_of(
+        newFile->allowableClients.begin(), newFile->allowableClients.end(),
+        [&](StringRef allowableClient) {
+          // We only do a prefix match to match LD64's behaviour.
+          return allowableClient.starts_with(config->clientName);
+        });
+
+    // TODO: This behaviour doesn't quite match the latest available source
+    // release of LD64 (ld64-951.9), which allows "parents" and "siblings"
+    // to link to libraries even when they're not explicitly named as
+    // allowable clients. However, behaviour around this seems to have
+    // changed in the latest release of Xcode (ld64-1115.7.3), so it's not
+    // clear what the correct thing to do is yet.
+    if (!allowed)
+      error("cannot link directly with '" +
+            sys::path::filename(newFile->installName) + "' because " +
+            config->clientName + " is not an allowed client");
+  }
   return newFile;
 }
 
diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index aedaecfdeb2c0..32dd44ab729e6 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -481,6 +481,33 @@ void macho::markAddrSigSymbols() {
   }
 }
 
+// Resolve a symbol that ICF folded into a thunk to the symbol that names the
+// real function body. The body is looked up through the thunk on demand instead
+// of being stored on the Defined, in order to minimize memory usage.
+Defined *macho::getBodyForThunkFoldedSym(Defined *foldedSym) {
+  InputSection *originalSec = foldedSym->originalIsec;
+  assert(isa<ConcatInputSection>(originalSec) &&
+         "thunk-folded ICF symbol expected to be on a ConcatInputSection");
+
+  // The section that was marked as deleted upon fold; its `replacement` is the
+  // live thunk holding the code that branches to the function body.
+  auto *deletedSec = cast<ConcatInputSection>(originalSec);
+  InputSection *liveThunk = deletedSec->replacement;
+
+  // Ask the target which section the thunk branches to: that is the merged
+  // function body that ends up in the final binary.
+  InputSection *mergedBody = target->getThunkBranchTarget(liveThunk);
+
+  // Only a Defined located at the very start of the body section qualifies.
+  for (Symbol *candidate : mergedBody->symbols) {
+    auto *bodySym = dyn_cast<Defined>(candidate);
+    if (!bodySym || bodySym->value != 0)
+      continue;
+    return bodySym;
+  }
+
+  llvm_unreachable("could not find body symbol for ICF-generated thunk");
+}
 void macho::foldIdenticalSections(bool onlyCfStrings) {
   TimeTraceScope timeScope("Fold Identical Code Sections");
   // The ICF equivalence-class segregation algorithm relies on pre-computed
diff --git a/lld/MachO/ICF.h b/lld/MachO/ICF.h
index 34ceb1cf284bf..e382fd6c60956 100644
--- a/lld/MachO/ICF.h
+++ b/lld/MachO/ICF.h
@@ -15,11 +15,17 @@
 
 namespace lld::macho {
 class Symbol;
+class Defined;
 
 void markAddrSigSymbols();
 void markSymAsAddrSig(Symbol *s);
 void foldIdenticalSections(bool onlyCfStrings);
 
+// Given a symbol that was folded into a thunk, return the symbol pointing to
+// the actual body of the function. We expose this function to allow getting the
+// main function body for a symbol that was folded via a thunk.
+Defined *getBodyForThunkFoldedSym(Defined *foldedSym);
+
 } // namespace lld::macho
 
 #endif
diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index 3086c9cc4729d..c3f7c434ffca1 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -1730,6 +1730,14 @@ DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella,
                       ? this
                       : this->umbrella;
 
+  if (!canBeImplicitlyLinked) {
+    for (auto *cmd : findCommands<sub_client_command>(hdr, LC_SUB_CLIENT)) {
+      StringRef allowableClient{reinterpret_cast<const char *>(cmd) +
+                                cmd->client};
+      allowableClients.push_back(allowableClient);
+    }
+  }
+
   const auto *dyldInfo = findCommand<dyld_info_command>(hdr, LC_DYLD_INFO_ONLY);
   const auto *exportsTrie =
       findCommand<linkedit_data_command>(hdr, LC_DYLD_EXPORTS_TRIE);
@@ -1891,6 +1899,12 @@ DylibFile::DylibFile(const InterfaceFile &interface, DylibFile *umbrella,
   exportingFile = (canBeImplicitlyLinked && isImplicitlyLinked(installName))
                       ? this
                       : umbrella;
+
+  if (!canBeImplicitlyLinked)
+    for (const auto &allowableClient : interface.allowableClients())
+      allowableClients.push_back(
+          *make<std::string>(allowableClient.getInstallName().data()));
+
   auto addSymbol = [&](const llvm::MachO::Symbol &symbol,
                        const Twine &name) -> void {
     StringRef savedName = saver().save(name);
diff --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h
index 5e550c167c232..bc8c8038a39d1 100644
--- a/lld/MachO/InputFiles.h
+++ b/lld/MachO/InputFiles.h
@@ -241,6 +241,7 @@ class DylibFile final : public InputFile {
   DylibFile *exportingFile = nullptr;
   DylibFile *umbrella;
   SmallVector<StringRef, 2> rpaths;
+  SmallVector<StringRef> allowableClients;
   uint32_t compatibilityVersion = 0;
   uint32_t currentVersion = 0;
   int64_t ordinal = 0; // Ordinal numbering starts from 1, so 0 is a sentinel
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 70eb7c8b9e466..739d1da15d466 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -875,8 +875,7 @@ def allowable_client : Separate<["-"], "allowable_client">,
     Group<grp_rare>;
 def client_name : Separate<["-"], "client_name">,
     MetaVarName<"<name>">,
-    HelpText<"Specifies a <name> this client should match with the -allowable_client <name> in a dependent dylib">,
-    Flags<[HelpHidden]>,
+    HelpText<"Specifies a <name> this client should match with the -allowable_client <name> in an explicitly linked dylib">,
     Group<grp_rare>;
 def umbrella : Separate<["-"], "umbrella">,
     MetaVarName<"<name>">,
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index eee87b3a6cb4d..24844c2f3a1eb 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -10,6 +10,7 @@
 #include "ConcatOutputSection.h"
 #include "Config.h"
 #include "ExportTrie.h"
+#include "ICF.h"
 #include "InputFiles.h"
 #include "MachOStructs.h"
 #include "ObjC.h"
@@ -1204,6 +1205,18 @@ void SymtabSection::emitEndFunStab(Defined *defined) {
   stabs.emplace_back(std::move(stab));
 }
 
+// Map a function symbol to the symbol for the function body that will go in
+// the final binary. Symbols whose fold kind is None or Body map to themselves;
+// for any other fold kind (a thunk fold) the body symbol is retrieved from the
+// thunk via getBodyForThunkFoldedSym.
+Defined *SymtabSection::getFuncBodySym(Defined *originalSym) {
+  const auto foldKind = originalSym->identicalCodeFoldingKind;
+  const bool isOwnBody = foldKind == Symbol::ICFFoldKind::None ||
+                         foldKind == Symbol::ICFFoldKind::Body;
+
+  return isOwnBody ? originalSym : macho::getBodyForThunkFoldedSym(originalSym);
+}
+
 void SymtabSection::emitStabs() {
   if (config->omitDebugInfo)
     return;
@@ -1229,20 +1242,10 @@ void SymtabSection::emitStabs() {
       if (defined->isAbsolute())
         continue;
 
-      // Never generate a STABS entry for a symbol that has been ICF'ed using a
-      // thunk, just as we do for fully ICF'ed functions. Otherwise, we end up
-      // generating invalid DWARF as dsymutil will assume the entire function
-      // body is at that location, when, in reality, only the thunk is
-      // present. This will end up causing overlapping DWARF entries.
-      // TODO: Find an implementation that works in combination with
-      // `--keep-icf-stabs`.
-      if (defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Thunk)
-        continue;
-
       // Constant-folded symbols go in the executable's symbol table, but don't
-      // get a stabs entry unless --keep-icf-stabs flag is specified
+      // get a stabs entry unless --keep-icf-stabs flag is specified.
       if (!config->keepICFStabs &&
-          defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body)
+          defined->identicalCodeFoldingKind != Symbol::ICFFoldKind::None)
         continue;
 
       ObjFile *file = defined->getObjectFile();
@@ -1251,8 +1254,8 @@ void SymtabSection::emitStabs() {
 
       // We use 'originalIsec' to get the file id of the symbol since 'isec()'
       // might point to the merged ICF symbol's file
-      symbolsNeedingStabs.emplace_back(defined,
-                                       defined->originalIsec->getFile()->id);
+      symbolsNeedingStabs.emplace_back(
+          defined, getFuncBodySym(defined)->originalIsec->getFile()->id);
     }
   }
 
@@ -1269,7 +1272,8 @@ void SymtabSection::emitStabs() {
     Defined *defined = pair.first;
     // We use 'originalIsec' of the symbol since we care about the actual origin
     // of the symbol, not the canonical location returned by `isec()`.
-    InputSection *isec = defined->originalIsec;
+    Defined *funcBodySym = getFuncBodySym(defined);
+    InputSection *isec = funcBodySym->originalIsec;
     ObjFile *file = cast<ObjFile>(isec->getFile());
 
     if (lastFile == nullptr || lastFile != file) {
@@ -1284,12 +1288,12 @@ void SymtabSection::emitStabs() {
     StabsEntry symStab;
     symStab.sect = isec->parent->index;
     symStab.strx = stringTableSection.addString(defined->getName());
-    symStab.value = defined->getVA();
+    symStab.value = funcBodySym->getVA();
 
     if (isCodeSection(isec)) {
       symStab.type = N_FUN;
       stabs.emplace_back(std::move(symStab));
-      emitEndFunStab(defined);
+      emitEndFunStab(funcBodySym);
     } else {
       symStab.type = defined->isExternal() ? N_GSYM : N_STSYM;
       stabs.emplace_back(std::move(symStab));
diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index a4c7f58481aa1..af99f22788d6e 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -485,6 +485,7 @@ class SymtabSection : public LinkEditSection {
   void emitEndSourceStab();
   void emitObjectFileStab(ObjFile *);
   void emitEndFunStab(Defined *);
+  Defined *getFuncBodySym(Defined *);
   void emitStabs();
 
 protected:
diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index eaa0336e70cb6..b5b80e083a6c3 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -80,6 +80,12 @@ class TargetInfo {
     llvm_unreachable("target does not support ICF safe thunks");
   }
 
+  // Given a thunk for which `initICFSafeThunkBody` was called, return the
+  // branchTarget it was initialized with. This default is llvm_unreachable.
+  virtual InputSection *getThunkBranchTarget(InputSection *thunk) const {
+    llvm_unreachable("target does not support ICF safe thunks");
+  }
+
   virtual uint32_t getICFSafeThunkSize() const {
     llvm_unreachable("target does not support ICF safe thunks");
   }
diff --git a/lld/test/COFF/arm64ec-delayimport.test b/lld/test/COFF/arm64ec-delayimport.test
index 0c8009362f80e..1e0bd899ba323 100644
--- a/lld/test/COFF/arm64ec-delayimport.test
+++ b/lld/test/COFF/arm64ec-delayimport.test
@@ -12,9 +12,9 @@ RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj
 RUN:          helper-mangled.obj test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll -map
 
 RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s
-TESTSEC:      0x180009000 00600000 88700000 00200000 10100000
-TESTSEC-NEXT: 0x180009010 08600000 90700000 10200000 30100000
-TESTSEC-NEXT: 0x180009020 1c100000 3c100000 00300000
+TESTSEC:      0x18000a000 00600000 88700000 00200000 10100000
+TESTSEC-NEXT: 0x18000a010 08600000 90700000 10200000 30100000
+TESTSEC-NEXT: 0x18000a020 1c100000 3c100000 00300000
 
 RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s
 DISASM:      0000000180001000 <.text>:
@@ -61,7 +61,7 @@ DISASM-NEXT: 18000203d: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
 DISASM-NEXT: 180002043: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
 DISASM-NEXT: 180002049: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
 DISASM-NEXT: 18000204f: 48 8b d0                     movq    %rax, %rdx
-DISASM-NEXT: 180002052: 48 8d 0d 97 21 00 00         leaq    0x2197(%rip), %rcx      # 0x1800041f0
+DISASM-NEXT: 180002052: 48 8d 0d a7 21 00 00         leaq    0x21a7(%rip), %rcx      # 0x180004200
 DISASM-NEXT: 180002059: e8 aa ef ff ff               callq   0x180001008 <.text+0x8>
 DISASM-NEXT: 18000205e: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
 DISASM-NEXT: 180002063: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
@@ -85,7 +85,7 @@ IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   Attributes: 0x1
 IMPORTS-NEXT:   ModuleHandle: 0x7080
 IMPORTS-NEXT:   ImportAddressTable: 0x7088
-IMPORTS-NEXT:   ImportNameTable: 0x4230
+IMPORTS-NEXT:   ImportNameTable: 0x4240
 IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:   Import {
@@ -140,6 +140,9 @@ RELOC-NEXT:     Type: DIR64
 RELOC-NEXT:     Address: 0x6008
 RELOC-NEXT:   }
 
+RUN: llvm-readobj --hex-dump=.pdata out.dll | FileCheck --check-prefix=PDATA %s
+PDATA: 0x180008000 2e200000 81200000 18400000
+
 Verify that a demangled version of __delayLoadHelper2 can be used.
 
 RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj test.obj \
diff --git a/lld/test/ELF/aarch64-thunk-bti-multipass.s b/lld/test/ELF/aarch64-thunk-bti-multipass.s
new file mode 100644
index 0000000000000..6569d6d00ec37
--- /dev/null
+++ b/lld/test/ELF/aarch64-thunk-bti-multipass.s
@@ -0,0 +1,113 @@
+// REQUIRES: aarch64
+// RUN: rm -rf %t && split-file %s %t && cd %t
+// RUN: llvm-mc -filetype=obj -triple=aarch64 asm -o a.o
+// RUN: ld.lld --script=lds a.o -o out
+// RUN: llvm-objdump -d --no-show-raw-insn out | FileCheck %s
+
+/// Test that a thunk that at creation time does not need to use a BTI
+/// compatible landing pad, but due to other thunk insertion ends up
+/// out of short-branch range so a BTI thunk is required after all.
+
+//--- asm
+.section ".note.gnu.property", "a"
+.p2align 3
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+/// Enable BTI.
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND.
+.long 4
+.long 1          // GNU_PROPERTY_AARCH64_FEATURE_1_BTI.
+.long 0
+
+.section .text.0, "ax", %progbits
+.balign 0x1000
+.global _start
+.type _start, %function
+_start:
+/// Call that requires a thunk.
+ bl fn1
+/// Padding so that the thunk for fn1, which is placed after this section, is
+/// sufficiently close to the target to be within short range, but only just,
+/// so that a small displacement will mean a long thunk is needed.
+ .space 0x1000
+/// Thunk for call to fn1 will be placed here. Initially it is in short Thunk
+/// range of fn1, but due to a thunk added after a later section it won't be
+/// and will need a long branch thunk, which in turn needs a BTI landing pad.
+
+// CHECK-LABEL: <_start>:
+// CHECK-NEXT: 10001000: bl  0x10002004 <__AArch64AbsLongThunk_fn1>
+
+/// FIXME, the 2nd ldr and udf are a result of mapping symbols being generated
+/// on Thunk insertion. When that is fixed in lld they will be data statements
+/// like in __AArch64AbsLongThunk_far below.
+// CHECK-LABEL: <__AArch64AbsLongThunk_fn1>:
+// CHECK-NEXT: 10002004: ldr     x16, 0x1000200c <__AArch64AbsLongThunk_fn1+0x8>
+// CHECK-NEXT:           br      x16
+// CHECK-NEXT:           ldr     w0, 0x1000260c <__AArch64AbsLongThunk_fn1+0x608>
+// CHECK-NEXT:           udf     #0x0
+
+
+.section .text.1, "ax", %progbits
+.balign 0x1000
+.global farcall
+.type farcall, %function
+farcall:
+/// Call that requires a thunk.
+ bl far
+/// Section is aligned to 0x1000 boundary with size multiple of 0x1000.
+.space 0x1000 - (. - farcall)
+/// Thunk for call to far will be placed here. This will force text.2
+/// on to the next alignment boundary, moving it further away from the
+/// thunk inserted in the .text_low output section.
+
+// CHECK-LABEL: <farcall>:
+// CHECK-NEXT: 18001000: bl      0x18002000 <__AArch64AbsLongThunk_far>
+
+// CHECK-LABEL: <__AArch64AbsLongThunk_far>:
+// CHECK-NEXT: 18002000: ldr     x16, 0x18002008 <__AArch64AbsLongThunk_far+0x8>
+// CHECK-NEXT:           br      x16
+// CHECK-NEXT:           00 00 00 30   .word   0x30000000
+// CHECK-NEXT:           00 00 00 00   .word   0x00000000
+
+.section .text.2, "ax", %progbits
+.balign 0x1000
+.global fn1
+.type fn1, %function
+fn1:
+ ret
+
+.section .text.far, "ax", %progbits
+.type far, %function
+.global far
+far:
+ ret
+
+// CHECK-LABEL: <__AArch64BTIThunk_fn1>:
+// CHECK-NEXT: 18003000: bti     c
+// CHECK-NEXT:           b       0x18004000 <fn1>
+
+// CHECK-LABEL: <fn1>:
+// CHECK-NEXT: 18004000: ret
+
+// CHECK-LABEL: <__AArch64BTIThunk_far>:
+// CHECK-NEXT: 30000000: bti     c
+
+// CHECK-LABEL: <far>:
+// CHECK-NEXT: 30000004: ret
+
+//--- lds
+PHDRS {
+  low PT_LOAD FLAGS(0x1 | 0x4);
+  mid PT_LOAD FLAGS(0x1 | 0x4);
+  high PT_LOAD FLAGS(0x1 | 0x4);
+}
+SECTIONS {
+  .rodata 0x10000000 : { *(.note.gnu.property) } :low
+  .text_low : { *(.text.0) } :low
+  .text 0x18001000 : { *(.text.1) } :mid
+  .text_aligned : { *(.text.2) } :mid
+  .text_high 0x30000000 : { *(.text.far) } :high
+}
diff --git a/lld/test/ELF/aarch64-thunk-bti.s b/lld/test/ELF/aarch64-thunk-bti.s
index a16e1569f358e..a447fe4ee9274 100644
--- a/lld/test/ELF/aarch64-thunk-bti.s
+++ b/lld/test/ELF/aarch64-thunk-bti.s
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64
 // RUN: rm -rf %t && split-file %s %t && cd %t
 // RUN: llvm-mc -filetype=obj -triple=aarch64 asm -o a.o
-// RUN: ld.lld --threads=1 --shared --script=lds a.o -o out.so --defsym absolute=0xf0000000
+// RUN: ld.lld --shared --script=lds a.o -o out.so --defsym absolute=0xf0000000
 // RUN: llvm-objdump -d --no-show-raw-insn out.so | FileCheck %s
 // RUN: llvm-objdump -d --no-show-raw-insn out.so | FileCheck %s --check-prefix=CHECK-PADS
 // RUN: llvm-mc -filetype=obj -triple=aarch64 shared -o shared.o
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
index fec873827e573..53860b5daf2b1 100644
--- a/lld/test/ELF/hexagon-jump-error.s
+++ b/lld/test/ELF/hexagon-jump-error.s
@@ -25,7 +25,7 @@ if (p0) jump #1f
 .section b15, "ax"
 1:
 
-# CHECK: relocation R_HEX_B22_PCREL out of range: 8388612 is not in [-2097152, 2097151]
+# CHECK: relocation R_HEX_B22_PCREL out of range: 8388612 is not in [-8388608, 8388607]
 jump #1f
 .space (1<<23)
 .section b22, "ax"
diff --git a/lld/test/ELF/hexagon.s b/lld/test/ELF/hexagon.s
index 8ef9b8eead8f1..b1576fb47d81a 100644
--- a/lld/test/ELF/hexagon.s
+++ b/lld/test/ELF/hexagon.s
@@ -1,7 +1,9 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %S/Inputs/hexagon.s -o %t1.o
-# RUN: ld.lld %t.o %t1.o -o %t
+# RUN: ld.lld %t.o %t1.o -o %t --Ttext=0x200b4 --section-start=b_1000000=0x1000000 \
+# RUN:  --section-start=b_1000400=0x1000400 --section-start=b_1004000=0x1004000 \
+# RUN:  --section-start=b_1010000=0x1010000 --section-start=b_1800000=0x1800000
 # RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
 
 # Note: 131584 == 0x20200
@@ -221,3 +223,40 @@ r0 = memw(r1+##_start)
 
 memw(r0+##_start) = r1
 # CHECK: memw(r0+##131644) = r1
+
+
+## Tests for maximum branch ranges reachable without trampolines.
+
+.section b_1000000, "ax"
+## The nop makes sure the first jump is within range.
+nop
+{ r0 = #0; jump #b_1000400 } // R_HEX_B9_PCREL
+if (r0==#0) jump:t #b_1004000 // R_HEX_B13_PCREL
+if (p0) jump #b_1010000 // R_HEX_B15_PCREL
+jump #b_1800000 // R_HEX_B22_PCREL
+
+.section b_1000400, "ax"
+nop
+
+.section b_1004000, "ax"
+nop
+
+.section b_1010000, "ax"
+nop
+
+.section b_1800000, "ax"
+nop
+
+## Make sure we got the right relocations.
+# RUN: llvm-readelf -r %t.o | FileCheck %s --check-prefix=REL
+# REL: R_HEX_B9_PCREL         00000000   b_1000400
+# REL: R_HEX_B13_PCREL        00000000   b_1004000
+# REL: R_HEX_B15_PCREL        00000000   b_1010000
+# REL: R_HEX_B22_PCREL        00000000   b_1800000
+
+# CHECK: 01000000 <b_1000000>:
+# CHECK-NEXT: 1000000: {{.*}} {  nop }
+# CHECK-NEXT: 1000004: {{.*}} {  r0 = #0 ; jump 0x1000400 }
+# CHECK-NEXT: 1000008: {{.*}} {  if (r0==#0) jump:t 0x1004000 }
+# CHECK-NEXT: 100000c: {{.*}} {  if (p0) jump:nt 0x1010000 }
+# CHECK-NEXT: 1000010: {{.*}} {  jump 0x1800000 }
diff --git a/lld/test/ELF/x86-64-gotpc-no-relax-err.s b/lld/test/ELF/x86-64-gotpc-no-relax-err.s
index 4280c8fd1dc97..8452090e2c35a 100644
--- a/lld/test/ELF/x86-64-gotpc-no-relax-err.s
+++ b/lld/test/ELF/x86-64-gotpc-no-relax-err.s
@@ -13,7 +13,7 @@
 # CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out of range: 2147483659 is not in [-2147483648, 2147483647]; references '__stop_data'
 # CHECK-NEXT: >>> defined in <internal>
 # CHECK-EMPTY:
-# CHECK-NEXT: error: {{.*}}:(.text+0x11): relocation R_X86_64_REX2_GOTPCRELX out of range: 2147483651 is not in [-2147483648, 2147483647]; references '__stop_data'
+# CHECK-NEXT: error: {{.*}}:(.text+0x11): relocation R_X86_64_CODE_4_GOTPCRELX out of range: 2147483651 is not in [-2147483648, 2147483647]; references '__stop_data'
 # CHECK-NEXT: >>> defined in <internal>
 
 #--- a.s
diff --git a/lld/test/ELF/x86-64-gotpc-relax-nopic.s b/lld/test/ELF/x86-64-gotpc-relax-nopic.s
index e3cd93d1d5796..be55c7d7006fe 100644
--- a/lld/test/ELF/x86-64-gotpc-relax-nopic.s
+++ b/lld/test/ELF/x86-64-gotpc-relax-nopic.s
@@ -134,7 +134,7 @@ _start:
   xorq    bar@GOTPCREL(%rip), %r8
   testq   %r15, bar@GOTPCREL(%rip)
 
-## R_X86_64_REX2_GOTPCRELX
+## R_X86_64_CODE_4_GOTPCRELX
   adcq    bar@GOTPCREL(%rip), %r16
   addq    bar@GOTPCREL(%rip), %r17
   andq    bar@GOTPCREL(%rip), %r18
diff --git a/lld/test/ELF/x86-64-gotpc-relax.s b/lld/test/ELF/x86-64-gotpc-relax.s
index b1ff995b3fc21..bc5830ba7b629 100644
--- a/lld/test/ELF/x86-64-gotpc-relax.s
+++ b/lld/test/ELF/x86-64-gotpc-relax.s
@@ -1,5 +1,5 @@
 # REQUIRES: x86
-## Test R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX GOT optimization.
+## Test R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX/R_X86_64_CODE_4_GOTPCRELX GOT optimization.
 
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
 # RUN: ld.lld %t.o -o %t1 --no-apply-dynamic-relocs
diff --git a/lld/test/MachO/Inputs/liballowable_client.dylib b/lld/test/MachO/Inputs/liballowable_client.dylib
new file mode 100755
index 0000000000000..7c174a8a72a4c
Binary files /dev/null and b/lld/test/MachO/Inputs/liballowable_client.dylib differ
diff --git a/lld/test/MachO/allowable-client.s b/lld/test/MachO/allowable-client.s
new file mode 100644
index 0000000000000..3341dc59c1d81
--- /dev/null
+++ b/lld/test/MachO/allowable-client.s
@@ -0,0 +1,74 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
+
+# Check linking against a .dylib
+# RUN: not %lld -o %t/test %t/test.o -L%S/Inputs -lallowable_client 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-IMPLICIT
+# RUN: not %lld -o %t/libtest_debug.exe %t/test.o -L%S/Inputs -lallowable_client 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-IMPLICIT
+# RUN: not %lld -o %t/test %t/test.o -L%S/Inputs -lallowable_client -client_name notallowed 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-EXPLICIT
+# RUN: %lld -o %t/test %t/test.o -L%S/Inputs -lallowable_client -client_name allowed
+# RUN: %lld -o %t/test %t/test.o -L%S/Inputs -lallowable_client -client_name all
+# RUN: %lld -o %t/all %t/test.o -L%S/Inputs -lallowable_client
+# RUN: %lld -o %t/allowed %t/test.o -L%S/Inputs -lallowable_client
+# RUN: %lld -o %t/liballowed_debug.exe %t/test.o -L%S/Inputs -lallowable_client
+
+# Check linking against a .tbd
+# RUN: not %lld -o %t/test %t/test.o -L%t -lallowable_client 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-IMPLICIT
+# RUN: not %lld -o %t/libtest_debug.exe %t/test.o -L%t -lallowable_client 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-IMPLICIT
+# RUN: not %lld -o %t/test %t/test.o -L%t -lallowable_client -client_name notallowed 2>&1 | FileCheck %s --check-prefix=NOTALLOWED-EXPLICIT
+# RUN: %lld -o %t/test %t/test.o -L%t -lallowable_client -client_name allowed
+# RUN: %lld -o %t/test %t/test.o -L%t -lallowable_client -client_name all
+# RUN: %lld -o %t/all %t/test.o -L%t -lallowable_client
+# RUN: %lld -o %t/allowed %t/test.o -L%t -lallowable_client
+# RUN: %lld -o %t/liballowed_debug.exe %t/test.o -L%t -lallowable_client
+
+# NOTALLOWED-IMPLICIT: error: cannot link directly with 'liballowable_client.dylib' because test is not an allowed client
+# NOTALLOWED-EXPLICIT: error: cannot link directly with 'liballowable_client.dylib' because notallowed is not an allowed client
+
+#--- test.s
+.text
+.globl _main
+_main:
+  ret
+
+#--- liballowable_client.tbd
+{
+  "main_library": {
+    "allowable_clients": [
+      {
+        "clients": [
+          "allowed"
+        ]
+      }
+    ],
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "lib/liballowable_client.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "10.11",
+        "target": "x86_64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/lld/test/MachO/icf-safe-thunks-dwarf.ll b/lld/test/MachO/icf-safe-thunks-dwarf.ll
index 1e4422a331323..9edad60946837 100644
--- a/lld/test/MachO/icf-safe-thunks-dwarf.ll
+++ b/lld/test/MachO/icf-safe-thunks-dwarf.ll
@@ -20,6 +20,59 @@
 ; VERIFY-STABS:  N_FUN{{.*}}_func_A
 ; VERIFY-STABS:  N_FUN{{.*}}_take_func_addr
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;; Check safe_thunks ICF + keeping STABS entries ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Check the scenario where we do safe_thunks ICF and also generate STABS entries
+; RUN: %lld -arch arm64 -lSystem --icf=safe_thunks --keep-icf-stabs -dylib -o %t/a_thunks.dylib %t/a.o
+; RUN: dsymutil -s %t/a_thunks.dylib > %t/a_thunks.txt
+
+
+; RUN: dsymutil --flat --verify-dwarf=none %t/a_thunks.dylib -o %t/a_thunks.dSYM
+; RUN: dsymutil -s %t/a_thunks.dSYM >> %t/a_thunks.txt
+; RUN: llvm-dwarfdump -a %t/a_thunks.dSYM >> %t/a_thunks.txt
+
+; RUN: cat %t/a_thunks.txt | FileCheck %s --check-prefix=VERIFY-THUNK
+
+# VERIFY-THUNK-LABEL: Symbol table for: '{{.*}}/a_thunks.dylib'
+# Capture the 'n_value's for N_FUN entries of _func_A, _func_B, and _func_C
+# VERIFY-THUNK:  [[MERGED_FUN_ADDR:[0-9a-f]+]] '_func_A'
+# VERIFY-THUNK:  [[MERGED_FUN_ADDR]] '_func_B'
+# VERIFY-THUNK:  [[MERGED_FUN_ADDR]] '_func_C'
+
+# Capture the 'n_value's for SECT EXT entries in the first part
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_A_NVALUE:[0-9a-f]+]] '_func_A'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_B_NVALUE:[0-9a-f]+]] '_func_B'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_C_NVALUE:[0-9a-f]+]] '_func_C'
+
+# VERIFY-THUNK: ----------------------------------------------------------------------
+# VERIFY-THUNK-LABEL: Symbol table for: '{{.*}}/a_thunks.dSYM'
+
+# Verify that the SECT EXT 'n_value's in the second part match the first part
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_A_NVALUE]] '_func_A'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_B_NVALUE]] '_func_B'
+# VERIFY-THUNK: SECT EXT{{.*}} [[SECT_EXT_C_NVALUE]] '_func_C'
+
+# Ensure that N_FUN 'n_value's match the DW_TAG_subprogram's DW_AT_low_pc
+# and that the DW_AT_name is at a specific relative position
+
+# VERIFY-THUNK-LABEL: .debug_info contents:
+# VERIFY-THUNK: Compile Unit: length = {{.*}}
+
+# Match the subprogram for func_A
+# VERIFY-THUNK: :   DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc	(0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name	("func_A")
+
+# Match the subprogram for func_B
+# VERIFY-THUNK: :   DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc	(0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name	("func_B")
+
+# Match the subprogram for func_C
+# VERIFY-THUNK: :   DW_TAG_subprogram
+# VERIFY-THUNK-NEXT: {{ +}}DW_AT_low_pc	(0x[[MERGED_FUN_ADDR]])
+# VERIFY-THUNK-NEXT-NEXT-NEXT-NEXT-NEXT: {{ +}}DW_AT_name	("func_C")
+
 ;--- a.cpp
 #define ATTR __attribute__((noinline)) extern "C"
 typedef unsigned long long ULL;
diff --git a/lld/test/wasm/tls-non-shared-memory.s b/lld/test/wasm/tls-non-shared-memory.s
index 1754fd6254bb8..04fbb62228a7e 100644
--- a/lld/test/wasm/tls-non-shared-memory.s
+++ b/lld/test/wasm/tls-non-shared-memory.s
@@ -44,6 +44,7 @@ tls1:
 
 # RUN: wasm-ld --no-gc-sections --no-entry -o %t.wasm %t.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
+# RUN: llvm-objdump --disassemble-symbols=get_tls1 --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS
 
 # RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o
 # RUN: obj2yaml %t.so | FileCheck %s --check-prefixes=SHARED,PIC
@@ -97,6 +98,14 @@ tls1:
 # CHECK-NEXT:        Content:         2A000000
 # CHECK-NEXT:  - Type:            CUSTOM
 
+# The constant value here which we add to `__tls_base` should not be absolute
+# but relative to `__tls_base`, in this case zero rather than 1024.
+# DIS:      <get_tls1>:
+# DIS-EMPTY:
+# DIS-NEXT:  global.get 1
+# DIS-NEXT:  i32.const 0
+# DIS-NEXT:  i32.add
+# DIS-NEXT:  end
 
 # In PIC mode we expect TLS data and non-TLS data to be merged into
 # a single segment which is initialized via the  __memory_base import
diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index b2bbd11c53ef2..e62e7bec609f5 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -310,12 +310,11 @@ uint32_t DefinedFunction::getExportedFunctionIndex() const {
   return function->getFunctionIndex();
 }
 
-uint64_t DefinedData::getVA() const {
+uint64_t DefinedData::getVA(bool absolute) const {
   LLVM_DEBUG(dbgs() << "getVA: " << getName() << "\n");
-  // In the shared memory case, TLS symbols are relative to the start of the TLS
-  // output segment (__tls_base).  When building without shared memory, TLS
-  // symbols absolute, just like non-TLS.
-  if (isTLS() && config->sharedMemory)
+  // TLS symbols (by default) are relative to the start of the TLS output
+  // segment (__tls_base).
+  if (isTLS() && !absolute)
     return getOutputSegmentOffset();
   if (segment)
     return segment->getVA(value);
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 5ce3ecbc4ab19..80b658773bd20 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -315,7 +315,9 @@ class DefinedData : public DataSymbol {
   static bool classof(const Symbol *s) { return s->kind() == DefinedDataKind; }
 
   // Returns the output virtual address of a defined data symbol.
-  uint64_t getVA() const;
+  // For TLS symbols, by default (unless absolute is set), this returns an
+  // address relative to `__tls_base`.
+  uint64_t getVA(bool absolute = false) const;
   void setVA(uint64_t va);
 
   // Returns the offset of a defined data symbol within its OutputSegment.
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index a3bc90cfe759c..1454c3324af98 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -514,7 +514,10 @@ void GlobalSection::writeBody() {
     } else {
       WasmInitExpr initExpr;
       if (auto *d = dyn_cast<DefinedData>(sym))
-        initExpr = intConst(d->getVA(), is64);
+        // In the sharedMemory case TLS globals are set during
+        // `__wasm_apply_global_tls_relocs`, but in the non-shared case
+        // we know the absolute value at link time.
+        initExpr = intConst(d->getVA(/*absolute=*/!config->sharedMemory), is64);
       else if (auto *f = dyn_cast<FunctionSymbol>(sym))
         initExpr = intConst(f->isStub ? 0 : f->getTableIndex(), is64);
       else {
diff --git a/lldb/bindings/interface/SBTargetExtensions.i b/lldb/bindings/interface/SBTargetExtensions.i
index d756a351a810a..43125d8970615 100644
--- a/lldb/bindings/interface/SBTargetExtensions.i
+++ b/lldb/bindings/interface/SBTargetExtensions.i
@@ -79,11 +79,11 @@ STRING_EXTENSION_LEVEL_OUTSIDE(SBTarget, lldb::eDescriptionLevelBrief)
                         module = self.sbtarget.GetModuleAtIndex(idx)
                         if module.uuid == key:
                             return module
-                elif type(key) is re.SRE_Pattern:
+                elif isinstance(key, type(re.compile(''))):
                     matching_modules = []
                     for idx in range(num_modules):
                         module = self.sbtarget.GetModuleAtIndex(idx)
-                        re_match = key.search(module.path.fullpath)
+                        re_match = key.search(module.file.fullpath)
                         if re_match:
                             matching_modules.append(module)
                     return matching_modules
diff --git a/lldb/docs/use/links.rst b/lldb/docs/use/links.rst
index 595a78c8db804..c9e29ed78a219 100644
--- a/lldb/docs/use/links.rst
+++ b/lldb/docs/use/links.rst
@@ -16,6 +16,15 @@ code.
 Videos
 ------
 
+`Run, Break, Inspect: Explore effective debugging in LLDB (2024)`_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Learn how to use LLDB to explore and debug codebases. We'll show you
+how to make the most of crashlogs and backtraces, and how to
+supercharge breakpoints with actions and complex stop
+conditions. We'll also explore how the “p” command and the latest
+features can enhance your debugging experience.
+
 `LLDB: Beyond “po” (2019)`_
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -33,23 +42,6 @@ custom breakpoints for more powerful debugging. Get the most out of
 Xcode’s view debugging tools to solve UI issues in your app more
 efficiently.
 
-`Debugging with LLDB (2012)`_
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-LLDB is the next-generation debugger for macOS and iOS. Get an
-introduction to using LLDB via the console interface and within Xcode’s
-graphical debugger. The team that created LLDB will demonstrate the
-latest features and improvements, helping you track down bugs more
-efficiently than ever before.
-
-`Migrating from GDB to LLDB (2011)`_
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-LLDB is the next-generation debugger for macOS and iOS. Discover why
-you’ll want to start using LLDB in your own development, get expert tips
-from the team that created LLDB, and see how it will help you track down
-bugs more efficiently than ever before.
-
 Books
 -----
 
@@ -73,10 +65,9 @@ iOS apps.
 A collection of LLDB aliases/regexes and Python scripts.
 
 .. _Dancing in the Debugger — A Waltz with LLDB (2014): https://www.objc.io/issues/19-debugging/lldb-debugging/
+.. _`Run, Break, Inspect: Explore effective debugging in LLDB (2024)`: https://developer.apple.com/videos/play/wwdc2024/10198
 .. _`LLDB: Beyond “po” (2019)`: https://developer.apple.com/videos/play/wwdc2019/429/
 .. _Advanced Debugging with Xcode and LLDB (2018): https://developer.apple.com/videos/play/wwdc2018/412/
-.. _Debugging with LLDB (2012): https://developer.apple.com/videos/play/wwdc2012/415/
-.. _Migrating from GDB to LLDB (2011): https://developer.apple.com/videos/play/wwdc2011/321/
 .. _Advanced Apple Debugging & Reverse Engineering (2018): https://www.raywenderlich.com/books/advanced-apple-debugging-reverse-engineering/
 .. _facebook/chisel: https://github.com/facebook/chisel
 .. _DerekSelander/LLDB: https://github.com/DerekSelander/LLDB
diff --git a/lldb/include/lldb/API/SBBreakpointName.h b/lldb/include/lldb/API/SBBreakpointName.h
index 838c66385bd12..4b7ad0cce345e 100644
--- a/lldb/include/lldb/API/SBBreakpointName.h
+++ b/lldb/include/lldb/API/SBBreakpointName.h
@@ -17,10 +17,6 @@ namespace lldb {
 
 class LLDB_API SBBreakpointName {
 public:
-//  typedef bool (*BreakpointHitCallback)(void *baton, SBProcess &process,
-//                                        SBThread &thread,
-//                                        lldb::SBBreakpointLocation &location);
-
   SBBreakpointName();
   
   SBBreakpointName(SBTarget &target, const char *name);
diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h
index e0d15c3ecc5b1..3635ee5a537ad 100644
--- a/lldb/include/lldb/API/SBFrame.h
+++ b/lldb/include/lldb/API/SBFrame.h
@@ -122,6 +122,11 @@ class LLDB_API SBFrame {
   lldb::SBValue EvaluateExpression(const char *expr,
                                    const SBExpressionOptions &options);
 
+  /// Language plugins can use this API to report language-specific
+  /// runtime information about this compile unit, such as additional
+  /// language version details or feature flags.
+  SBStructuredData GetLanguageSpecificData() const;
+
   /// Gets the lexical block that defines the stack frame. Another way to think
   /// of this is it will return the block that contains all of the variables
   /// for a stack frame. Inlined functions are represented as SBBlock objects
diff --git a/lldb/include/lldb/API/SBStructuredData.h b/lldb/include/lldb/API/SBStructuredData.h
index ccdd12cab94b2..c0d214a7374c6 100644
--- a/lldb/include/lldb/API/SBStructuredData.h
+++ b/lldb/include/lldb/API/SBStructuredData.h
@@ -114,6 +114,7 @@ class SBStructuredData {
   friend class SBCommandReturnObject;
   friend class SBLaunchInfo;
   friend class SBDebugger;
+  friend class SBFrame;
   friend class SBTarget;
   friend class SBProcess;
   friend class SBThread;
diff --git a/lldb/include/lldb/Host/Editline.h b/lldb/include/lldb/Host/Editline.h
index 57e2c831e3499..e8e8a6c0d4f67 100644
--- a/lldb/include/lldb/Host/Editline.h
+++ b/lldb/include/lldb/Host/Editline.h
@@ -238,6 +238,8 @@ class Editline {
   /// Convert the current input lines into a UTF8 StringList
   StringList GetInputAsStringList(int line_count = UINT32_MAX);
 
+  size_t GetTerminalWidth() { return m_terminal_width; }
+
 private:
   /// Sets the lowest line number for multi-line editing sessions.  A value of
   /// zero suppresses line number printing in the prompt.
diff --git a/lldb/include/lldb/Host/MainLoopBase.h b/lldb/include/lldb/Host/MainLoopBase.h
index 7365ee7a65ee6..be9a2676e7443 100644
--- a/lldb/include/lldb/Host/MainLoopBase.h
+++ b/lldb/include/lldb/Host/MainLoopBase.h
@@ -13,8 +13,10 @@
 #include "lldb/Utility/Status.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <chrono>
 #include <functional>
 #include <mutex>
+#include <queue>
 
 namespace lldb_private {
 
@@ -38,6 +40,9 @@ class MainLoopBase {
   class ReadHandle;
 
 public:
+  using TimePoint = std::chrono::time_point<std::chrono::steady_clock,
+                                            std::chrono::nanoseconds>;
+
   MainLoopBase() : m_terminate_request(false) {}
   virtual ~MainLoopBase() = default;
 
@@ -52,7 +57,18 @@ class MainLoopBase {
   // Add a pending callback that will be executed once after all the pending
   // events are processed. The callback will be executed even if termination
   // was requested.
-  void AddPendingCallback(const Callback &callback);
+  void AddPendingCallback(const Callback &callback) {
+    AddCallback(callback, std::chrono::steady_clock::time_point());
+  }
+
+  // Add a callback that will be executed after a certain amount of time has
+  // passed.
+  void AddCallback(const Callback &callback, std::chrono::nanoseconds delay) {
+    AddCallback(callback, std::chrono::steady_clock::now() + delay);
+  }
+
+  // Add a callback that will be executed after a given point in time.
+  void AddCallback(const Callback &callback, TimePoint point);
 
   // Waits for registered events and invoke the proper callbacks. Returns when
   // all callbacks deregister themselves or when someone requests termination.
@@ -69,14 +85,18 @@ class MainLoopBase {
 
   virtual void UnregisterReadObject(IOObject::WaitableHandle handle) = 0;
 
-  // Interrupt the loop that is currently waiting for events and execute
-  // the current pending callbacks immediately.
-  virtual void TriggerPendingCallbacks() = 0;
+  // Interrupt the loop that is currently waiting for events.
+  virtual void Interrupt() = 0;
+
+  void ProcessCallbacks();
 
-  void ProcessPendingCallbacks();
+  std::optional<TimePoint> GetNextWakeupTime();
 
   std::mutex m_callback_mutex;
-  std::vector<Callback> m_pending_callbacks;
+  std::priority_queue<std::pair<TimePoint, Callback>,
+                      std::vector<std::pair<TimePoint, Callback>>,
+                      llvm::on_first<std::greater<TimePoint>>>
+      m_callbacks;
   bool m_terminate_request : 1;
 
 private:
diff --git a/lldb/include/lldb/Host/posix/MainLoopPosix.h b/lldb/include/lldb/Host/posix/MainLoopPosix.h
index 1988dde7c65ae..e9ac798b948df 100644
--- a/lldb/include/lldb/Host/posix/MainLoopPosix.h
+++ b/lldb/include/lldb/Host/posix/MainLoopPosix.h
@@ -54,7 +54,7 @@ class MainLoopPosix : public MainLoopBase {
   void UnregisterReadObject(IOObject::WaitableHandle handle) override;
   void UnregisterSignal(int signo, std::list<Callback>::iterator callback_it);
 
-  void TriggerPendingCallbacks() override;
+  void Interrupt() override;
 
 private:
   void ProcessReadObject(IOObject::WaitableHandle handle);
@@ -88,8 +88,8 @@ class MainLoopPosix : public MainLoopBase {
 
   llvm::DenseMap<IOObject::WaitableHandle, Callback> m_read_fds;
   llvm::DenseMap<int, SignalInfo> m_signals;
-  Pipe m_trigger_pipe;
-  std::atomic<bool> m_triggering;
+  Pipe m_interrupt_pipe;
+  std::atomic<bool> m_interrupting = false;
 #if HAVE_SYS_EVENT_H
   int m_kqueue;
 #endif
diff --git a/lldb/include/lldb/Host/windows/MainLoopWindows.h b/lldb/include/lldb/Host/windows/MainLoopWindows.h
index 33e179e6c1286..3937a24645d95 100644
--- a/lldb/include/lldb/Host/windows/MainLoopWindows.h
+++ b/lldb/include/lldb/Host/windows/MainLoopWindows.h
@@ -34,7 +34,7 @@ class MainLoopWindows : public MainLoopBase {
 protected:
   void UnregisterReadObject(IOObject::WaitableHandle handle) override;
 
-  void TriggerPendingCallbacks() override;
+  void Interrupt() override;
 
 private:
   void ProcessReadObject(IOObject::WaitableHandle handle);
@@ -45,7 +45,7 @@ class MainLoopWindows : public MainLoopBase {
     Callback callback;
   };
   llvm::DenseMap<IOObject::WaitableHandle, FdInfo> m_read_fds;
-  void *m_trigger_event;
+  void *m_interrupt_event;
 };
 
 } // namespace lldb_private
diff --git a/lldb/include/lldb/Target/LanguageRuntime.h b/lldb/include/lldb/Target/LanguageRuntime.h
index 21bdc61b8cbcf..4a0214b04e235 100644
--- a/lldb/include/lldb/Target/LanguageRuntime.h
+++ b/lldb/include/lldb/Target/LanguageRuntime.h
@@ -241,6 +241,11 @@ class LanguageRuntime : public Runtime, public PluginInterface {
                        lldb_private::RegisterContext *regctx,
                        bool &behaves_like_zeroth_frame);
 
+  /// Language runtime plugins can use this API to report
+  /// language-specific runtime information about this compile unit,
+  /// such as additional language version details or feature flags.
+  virtual StructuredData::ObjectSP GetLanguageSpecificData(SymbolContext sc);
+
 protected:
   // The static GetRuntimeUnwindPlan method above is only implemented in the
   // base class; subclasses may override this protected member if they can
diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h
index e85430791b7d9..3f51c9a7f22f0 100644
--- a/lldb/include/lldb/Target/StackFrame.h
+++ b/lldb/include/lldb/Target/StackFrame.h
@@ -22,6 +22,7 @@
 #include "lldb/Utility/Scalar.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/StreamString.h"
+#include "lldb/Utility/StructuredData.h"
 #include "lldb/Utility/UserID.h"
 #include "lldb/ValueObject/ValueObjectList.h"
 
@@ -408,6 +409,11 @@ class StackFrame : public ExecutionContextScope,
   /// system implementation details this way.
   bool IsHidden();
 
+  /// Language plugins can use this API to report language-specific
+  /// runtime information about this compile unit, such as additional
+  /// language version details or feature flags.
+  StructuredData::ObjectSP GetLanguageSpecificData();
+
   /// Get the frame's demangled name.
   ///
   ///  /// \return
diff --git a/lldb/include/lldb/Target/ThreadPlanStack.h b/lldb/include/lldb/Target/ThreadPlanStack.h
index e6a560a509261..e0f8104de9a4d 100644
--- a/lldb/include/lldb/Target/ThreadPlanStack.h
+++ b/lldb/include/lldb/Target/ThreadPlanStack.h
@@ -14,6 +14,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "llvm/Support/RWMutex.h"
+
 #include "lldb/Target/Target.h"
 #include "lldb/Target/Thread.h"
 #include "lldb/lldb-private-forward.h"
@@ -96,9 +98,12 @@ class ThreadPlanStack {
   void ClearThreadCache();
 
 private:
-  void PrintOneStack(Stream &s, llvm::StringRef stack_name,
-                     const PlanStack &stack, lldb::DescriptionLevel desc_level,
-                     bool include_internal) const;
+  lldb::ThreadPlanSP DiscardPlanNoLock();
+  lldb::ThreadPlanSP GetCurrentPlanNoLock() const;
+  void PrintOneStackNoLock(Stream &s, llvm::StringRef stack_name,
+                           const PlanStack &stack,
+                           lldb::DescriptionLevel desc_level,
+                           bool include_internal) const;
 
   PlanStack m_plans;           ///< The stack of plans this thread is executing.
   PlanStack m_completed_plans; ///< Plans that have been completed by this
@@ -110,7 +115,7 @@ class ThreadPlanStack {
   size_t m_completed_plan_checkpoint = 0; // Monotonically increasing token for
                                           // completed plan checkpoints.
   std::unordered_map<size_t, PlanStack> m_completed_plan_store;
-  mutable std::recursive_mutex m_stack_mutex;
+  mutable llvm::sys::RWMutex m_stack_mutex;
 };
 
 class ThreadPlanStackMap {
diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp
index e2c691fa9bfd4..2300bec4d685d 100644
--- a/lldb/source/API/SBFrame.cpp
+++ b/lldb/source/API/SBFrame.cpp
@@ -47,6 +47,7 @@
 #include "lldb/API/SBExpressionOptions.h"
 #include "lldb/API/SBFormat.h"
 #include "lldb/API/SBStream.h"
+#include "lldb/API/SBStructuredData.h"
 #include "lldb/API/SBSymbolContext.h"
 #include "lldb/API/SBThread.h"
 #include "lldb/API/SBValue.h"
@@ -1154,6 +1155,21 @@ lldb::SBValue SBFrame::EvaluateExpression(const char *expr,
   return expr_result;
 }
 
+SBStructuredData SBFrame::GetLanguageSpecificData() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  SBStructuredData sb_data;
+  std::unique_lock<std::recursive_mutex> lock;
+  ExecutionContext exe_ctx(m_opaque_sp.get(), lock);
+  StackFrame *frame = exe_ctx.GetFramePtr();
+  if (!frame)
+    return sb_data;
+
+  StructuredData::ObjectSP data(frame->GetLanguageSpecificData());
+  sb_data.m_impl_up->SetObjectSP(data);
+  return sb_data;
+}
+
 bool SBFrame::IsInlined() {
   LLDB_INSTRUMENT_VA(this);
 
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index f95f854c5f220..c9e890304ae1e 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -927,12 +927,86 @@ unsigned char Editline::BufferEndCommand(int ch) {
 static void
 PrintCompletion(FILE *output_file,
                 llvm::ArrayRef<CompletionResult::Completion> results,
-                size_t max_len) {
+                size_t max_completion_length, size_t max_length) {
+  constexpr size_t ellipsis_length = 3;
+  constexpr size_t padding_length = 8;
+  constexpr size_t separator_length = 4;
+
+  const size_t description_col =
+      std::min(max_completion_length + padding_length, max_length);
+
   for (const CompletionResult::Completion &c : results) {
-    fprintf(output_file, "\t%-*s", (int)max_len, c.GetCompletion().c_str());
-    if (!c.GetDescription().empty())
-      fprintf(output_file, " -- %s", c.GetDescription().c_str());
-    fprintf(output_file, "\n");
+    if (c.GetCompletion().empty())
+      continue;
+
+    // Print the leading padding.
+    fprintf(output_file, "        ");
+
+    // Print the completion with trailing padding to the description column if
+    // that fits on the screen. Otherwise print whatever fits on the screen
+    // followed by ellipsis.
+    const size_t completion_length = c.GetCompletion().size();
+    if (padding_length + completion_length < max_length) {
+      fprintf(output_file, "%-*s",
+              static_cast<int>(description_col - padding_length),
+              c.GetCompletion().c_str());
+    } else {
+      // If the completion doesn't fit on the screen, print ellipsis and don't
+      // bother with the description.
+      fprintf(output_file, "%.*s...\n",
+              static_cast<int>(max_length - padding_length - ellipsis_length),
+              c.GetCompletion().c_str());
+      continue;
+    }
+
+    // If we don't have a description, or we don't have enough space left to
+    // print the separator followed by the ellipsis, we're done.
+    if (c.GetDescription().empty() ||
+        description_col + separator_length + ellipsis_length >= max_length) {
+      fprintf(output_file, "\n");
+      continue;
+    }
+
+    // Print the separator.
+    fprintf(output_file, " -- ");
+
+    // Descriptions can contain newlines. We want to print them below each
+    // other, aligned after the separator. For example, foo has a
+    // two-line description:
+    //
+    // foo   -- Something that fits on the line.
+    //          More information below.
+    //
+    // However, as soon as a line exceeds the available screen width and we
+    // print ellipsis, we stop printing further lines. For example, foo has a
+    // three-line description:
+    //
+    // foo   -- Something that fits on the line.
+    //          Something much longer that doesn't fit...
+    //
+    // Because we had to print ellipsis on line two, we don't print the
+    // third line.
+    bool first = true;
+    for (llvm::StringRef line : llvm::split(c.GetDescription(), '\n')) {
+      if (line.empty())
+        break;
+      if (!first)
+        fprintf(output_file, "%*s",
+                static_cast<int>(description_col + separator_length), "");
+
+      first = false;
+      const size_t position = description_col + separator_length;
+      const size_t description_length = line.size();
+      if (position + description_length < max_length) {
+        fprintf(output_file, "%.*s\n", static_cast<int>(description_length),
+                line.data());
+      } else {
+        fprintf(output_file, "%.*s...\n",
+                static_cast<int>(max_length - position - ellipsis_length),
+                line.data());
+        break;
+      }
+    }
   }
 }
 
@@ -953,7 +1027,8 @@ void Editline::DisplayCompletions(
   const size_t max_len = longest->GetCompletion().size();
 
   if (results.size() < page_size) {
-    PrintCompletion(editline.m_output_file, results, max_len);
+    PrintCompletion(editline.m_output_file, results, max_len,
+                    editline.GetTerminalWidth());
     return;
   }
 
@@ -963,7 +1038,7 @@ void Editline::DisplayCompletions(
     size_t next_size = all ? remaining : std::min(page_size, remaining);
 
     PrintCompletion(editline.m_output_file, results.slice(cur_pos, next_size),
-                    max_len);
+                    max_len, editline.GetTerminalWidth());
 
     cur_pos += next_size;
 
diff --git a/lldb/source/Host/common/MainLoopBase.cpp b/lldb/source/Host/common/MainLoopBase.cpp
index 030a4f0371681..64a57e65849e9 100644
--- a/lldb/source/Host/common/MainLoopBase.cpp
+++ b/lldb/source/Host/common/MainLoopBase.cpp
@@ -7,27 +7,43 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Host/MainLoopBase.h"
+#include <chrono>
 
 using namespace lldb;
 using namespace lldb_private;
 
-void MainLoopBase::AddPendingCallback(const Callback &callback) {
+void MainLoopBase::AddCallback(const Callback &callback, TimePoint point) {
+  bool interrupt_needed;
   {
     std::lock_guard<std::mutex> lock{m_callback_mutex};
-    m_pending_callbacks.push_back(callback);
+    // We need to interrupt the main thread if this callback is scheduled to
+    // execute at an earlier time than the earliest callback registered so far.
+    interrupt_needed = m_callbacks.empty() || point < m_callbacks.top().first;
+    m_callbacks.emplace(point, callback);
   }
-  TriggerPendingCallbacks();
+  if (interrupt_needed)
+    Interrupt();
 }
 
-void MainLoopBase::ProcessPendingCallbacks() {
-  // Move the callbacks to a local vector to avoid keeping m_pending_callbacks
-  // locked throughout the calls.
-  std::vector<Callback> pending_callbacks;
-  {
-    std::lock_guard<std::mutex> lock{m_callback_mutex};
-    pending_callbacks = std::move(m_pending_callbacks);
-  }
+void MainLoopBase::ProcessCallbacks() {
+  while (true) {
+    Callback callback;
+    {
+      std::lock_guard<std::mutex> lock{m_callback_mutex};
+      if (m_callbacks.empty() ||
+          std::chrono::steady_clock::now() < m_callbacks.top().first)
+        return;
+      callback = std::move(m_callbacks.top().second);
+      m_callbacks.pop();
+    }
 
-  for (const Callback &callback : pending_callbacks)
     callback(*this);
+  }
+}
+
+std::optional<MainLoopBase::TimePoint> MainLoopBase::GetNextWakeupTime() {
+  std::lock_guard<std::mutex> lock(m_callback_mutex);
+  if (m_callbacks.empty())
+    return std::nullopt;
+  return m_callbacks.top().first;
 }
diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp
index 46993aea9cb10..1715610e0f84f 100644
--- a/lldb/source/Host/posix/MainLoopPosix.cpp
+++ b/lldb/source/Host/posix/MainLoopPosix.cpp
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cerrno>
+#include <chrono>
 #include <csignal>
 #include <ctime>
 #include <fcntl.h>
@@ -68,6 +69,30 @@ static void SignalHandler(int signo, siginfo_t *info, void *) {
   (void)bytes_written;
 }
 
+class ToTimeSpec {
+public:
+  explicit ToTimeSpec(std::optional<MainLoopPosix::TimePoint> point) {
+    using namespace std::chrono;
+
+    if (!point) {
+      m_ts_ptr = nullptr;
+      return;
+    }
+    nanoseconds dur = std::max(*point - steady_clock::now(), nanoseconds(0));
+    m_ts_ptr = &m_ts;
+    m_ts.tv_sec = duration_cast<seconds>(dur).count();
+    m_ts.tv_nsec = (dur % seconds(1)).count();
+  }
+  ToTimeSpec(const ToTimeSpec &) = delete;
+  ToTimeSpec &operator=(const ToTimeSpec &) = delete;
+
+  operator struct timespec *() { return m_ts_ptr; }
+
+private:
+  struct timespec m_ts;
+  struct timespec *m_ts_ptr;
+};
+
 class MainLoopPosix::RunImpl {
 public:
   RunImpl(MainLoopPosix &loop);
@@ -100,8 +125,9 @@ Status MainLoopPosix::RunImpl::Poll() {
   for (auto &fd : loop.m_read_fds)
     EV_SET(&in_events[i++], fd.first, EVFILT_READ, EV_ADD, 0, 0, 0);
 
-  num_events = kevent(loop.m_kqueue, in_events.data(), in_events.size(),
-                      out_events, std::size(out_events), nullptr);
+  num_events =
+      kevent(loop.m_kqueue, in_events.data(), in_events.size(), out_events,
+             std::size(out_events), ToTimeSpec(loop.GetNextWakeupTime()));
 
   if (num_events < 0) {
     if (errno == EINTR) {
@@ -145,7 +171,7 @@ Status MainLoopPosix::RunImpl::Poll() {
   }
 
   if (ppoll(read_fds.data(), read_fds.size(),
-            /*timeout=*/nullptr,
+            ToTimeSpec(loop.GetNextWakeupTime()),
             /*sigmask=*/nullptr) == -1 &&
       errno != EINTR)
     return Status(errno, eErrorTypePOSIX);
@@ -166,27 +192,28 @@ void MainLoopPosix::RunImpl::ProcessReadEvents() {
 }
 #endif
 
-MainLoopPosix::MainLoopPosix() : m_triggering(false) {
-  Status error = m_trigger_pipe.CreateNew(/*child_process_inherit=*/false);
+MainLoopPosix::MainLoopPosix() {
+  Status error = m_interrupt_pipe.CreateNew(/*child_process_inherit=*/false);
   assert(error.Success());
 
   // Make the write end of the pipe non-blocking.
-  int result = fcntl(m_trigger_pipe.GetWriteFileDescriptor(), F_SETFL,
-                     fcntl(m_trigger_pipe.GetWriteFileDescriptor(), F_GETFL) |
+  int result = fcntl(m_interrupt_pipe.GetWriteFileDescriptor(), F_SETFL,
+                     fcntl(m_interrupt_pipe.GetWriteFileDescriptor(), F_GETFL) |
                          O_NONBLOCK);
   assert(result == 0);
   UNUSED_IF_ASSERT_DISABLED(result);
 
-  const int trigger_pipe_fd = m_trigger_pipe.GetReadFileDescriptor();
-  m_read_fds.insert({trigger_pipe_fd, [trigger_pipe_fd](MainLoopBase &loop) {
-                       char c;
-                       ssize_t bytes_read = llvm::sys::RetryAfterSignal(
-                           -1, ::read, trigger_pipe_fd, &c, 1);
-                       assert(bytes_read == 1);
-                       UNUSED_IF_ASSERT_DISABLED(bytes_read);
-                       // NB: This implicitly causes another loop iteration
-                       // and therefore the execution of pending callbacks.
-                     }});
+  const int interrupt_pipe_fd = m_interrupt_pipe.GetReadFileDescriptor();
+  m_read_fds.insert(
+      {interrupt_pipe_fd, [interrupt_pipe_fd](MainLoopBase &loop) {
+         char c;
+         ssize_t bytes_read =
+             llvm::sys::RetryAfterSignal(-1, ::read, interrupt_pipe_fd, &c, 1);
+         assert(bytes_read == 1);
+         UNUSED_IF_ASSERT_DISABLED(bytes_read);
+         // NB: This implicitly causes another loop iteration
+         // and therefore the execution of pending callbacks.
+       }});
 #if HAVE_SYS_EVENT_H
   m_kqueue = kqueue();
   assert(m_kqueue >= 0);
@@ -197,8 +224,8 @@ MainLoopPosix::~MainLoopPosix() {
 #if HAVE_SYS_EVENT_H
   close(m_kqueue);
 #endif
-  m_read_fds.erase(m_trigger_pipe.GetReadFileDescriptor());
-  m_trigger_pipe.Close();
+  m_read_fds.erase(m_interrupt_pipe.GetReadFileDescriptor());
+  m_interrupt_pipe.Close();
   assert(m_read_fds.size() == 0); 
   assert(m_signals.size() == 0);
 }
@@ -245,11 +272,9 @@ MainLoopPosix::RegisterSignal(int signo, const Callback &callback,
   sigset_t old_set;
 
   // Set signal info before installing the signal handler!
-  g_signal_info[signo].pipe_fd = m_trigger_pipe.GetWriteFileDescriptor();
+  g_signal_info[signo].pipe_fd = m_interrupt_pipe.GetWriteFileDescriptor();
   g_signal_info[signo].flag = 0;
 
-  // Even if using kqueue, the signal handler will still be invoked, so it's
-  // important to replace it with our "benign" handler.
   int ret = sigaction(signo, &new_action, &info.old_action);
   UNUSED_IF_ASSERT_DISABLED(ret);
   assert(ret == 0 && "sigaction failed");
@@ -308,8 +333,8 @@ Status MainLoopPosix::Run() {
 
     ProcessSignals();
 
-    m_triggering = false;
-    ProcessPendingCallbacks();
+    m_interrupting = false;
+    ProcessCallbacks();
   }
   return Status();
 }
@@ -347,13 +372,13 @@ void MainLoopPosix::ProcessSignal(int signo) {
   }
 }
 
-void MainLoopPosix::TriggerPendingCallbacks() {
-  if (m_triggering.exchange(true))
+void MainLoopPosix::Interrupt() {
+  if (m_interrupting.exchange(true))
     return;
 
   char c = '.';
   size_t bytes_written;
-  Status error = m_trigger_pipe.Write(&c, 1, bytes_written);
+  Status error = m_interrupt_pipe.Write(&c, 1, bytes_written);
   assert(error.Success());
   UNUSED_IF_ASSERT_DISABLED(error);
   assert(bytes_written == 1);
diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp
index c9aa6d339d8f4..0a5a35e9db9dd 100644
--- a/lldb/source/Host/windows/MainLoopWindows.cpp
+++ b/lldb/source/Host/windows/MainLoopWindows.cpp
@@ -21,14 +21,24 @@
 using namespace lldb;
 using namespace lldb_private;
 
+static DWORD ToTimeout(std::optional<MainLoopWindows::TimePoint> point) {
+  using namespace std::chrono;
+  // Without a pending wake-up time the wait must not time out.
+  if (!point)
+    return WSA_INFINITE;
+  // Round up: truncating would yield 0 ms while time remains and busy-loop.
+  nanoseconds dur = (std::max)(*point - steady_clock::now(), nanoseconds(0));
+  return std::chrono::ceil<milliseconds>(dur).count();
+}
+
 MainLoopWindows::MainLoopWindows() {
-  m_trigger_event = WSACreateEvent();
-  assert(m_trigger_event != WSA_INVALID_EVENT);
+  m_interrupt_event = WSACreateEvent();
+  assert(m_interrupt_event != WSA_INVALID_EVENT);
 }
 
 MainLoopWindows::~MainLoopWindows() {
   assert(m_read_fds.empty());
-  BOOL result = WSACloseEvent(m_trigger_event);
+  BOOL result = WSACloseEvent(m_interrupt_event);
   assert(result == TRUE);
   UNUSED_IF_ASSERT_DISABLED(result);
 }
@@ -43,10 +53,11 @@ llvm::Expected<size_t> MainLoopWindows::Poll() {
 
     events.push_back(info.event);
   }
-  events.push_back(m_trigger_event);
+  events.push_back(m_interrupt_event);
 
-  DWORD result = WSAWaitForMultipleEvents(events.size(), events.data(), FALSE,
-                                          WSA_INFINITE, FALSE);
+  DWORD result =
+      WSAWaitForMultipleEvents(events.size(), events.data(), FALSE,
+                               ToTimeout(GetNextWakeupTime()), FALSE);
 
   for (auto &fd : m_read_fds) {
     int result = WSAEventSelect(fd.first, WSA_INVALID_EVENT, 0);
@@ -54,9 +65,13 @@ llvm::Expected<size_t> MainLoopWindows::Poll() {
     UNUSED_IF_ASSERT_DISABLED(result);
   }
 
-  if (result >= WSA_WAIT_EVENT_0 && result <= WSA_WAIT_EVENT_0 + events.size())
+  if (result >= WSA_WAIT_EVENT_0 && result < WSA_WAIT_EVENT_0 + events.size())
     return result - WSA_WAIT_EVENT_0;
 
+  // A timeout is treated as a (premature) signalization of the interrupt event.
+  if (result == WSA_WAIT_TIMEOUT)
+    return events.size() - 1;
+
   return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                  "WSAWaitForMultipleEvents failed");
 }
@@ -127,13 +142,11 @@ Status MainLoopWindows::Run() {
       ProcessReadObject(KV.first);
     } else {
       assert(*signaled_event == m_read_fds.size());
-      WSAResetEvent(m_trigger_event);
+      WSAResetEvent(m_interrupt_event);
     }
-    ProcessPendingCallbacks();
+    ProcessCallbacks();
   }
   return Status();
 }
 
-void MainLoopWindows::TriggerPendingCallbacks() {
-  WSASetEvent(m_trigger_event);
-}
+void MainLoopWindows::Interrupt() { WSASetEvent(m_interrupt_event); }
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index 4c794b81809c6..0083b49965697 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -3398,6 +3398,14 @@ std::optional<uint64_t> AppleObjCRuntimeV2::GetSharedCacheImageHeaderVersion() {
   return std::nullopt;
 }
 
+StructuredData::ObjectSP
+AppleObjCRuntimeV2::GetLanguageSpecificData(SymbolContext sc) {
+  auto dict_up = std::make_unique<StructuredData::Dictionary>();
+  dict_up->AddItem("Objective-C runtime version",
+                   std::make_unique<StructuredData::UnsignedInteger>(2));
+  return dict_up;
+}
+
 #pragma mark Frame recognizers
 
 class ObjCExceptionRecognizedStackFrame : public RecognizedStackFrame {
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.h
index c9d0b3a907b54..2422539b13f13 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.h
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.h
@@ -106,6 +106,8 @@ class AppleObjCRuntimeV2 : public AppleObjCRuntime {
 
   std::optional<uint64_t> GetSharedCacheImageHeaderVersion();
 
+  StructuredData::ObjectSP GetLanguageSpecificData(SymbolContext sc) override;
+
 protected:
   lldb::BreakpointResolverSP
   CreateExceptionResolver(const lldb::BreakpointSP &bkpt, bool catch_bp,
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 9c7dff8127f47..eac9ab4577d3e 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -22,6 +22,7 @@
 #include "lldb/Host/LZMA.h"
 #include "lldb/Symbol/DWARFCallFrameInfo.h"
 #include "lldb/Symbol/SymbolContext.h"
+#include "lldb/Target/Process.h"
 #include "lldb/Target/SectionLoadList.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/ArchSpec.h"
@@ -3017,6 +3018,19 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) {
           ParseSymbolTable(&lldb_symtab, symbol_id, dynsym);
       symbol_id += num_symbols;
       m_address_class_map.merge(address_class_map);
+    } else {
+      // Try and read the dynamic symbol table from the .dynamic section.
+      uint32_t dynamic_num_symbols = 0;
+      std::optional<DataExtractor> symtab_data =
+          GetDynsymDataFromDynamic(dynamic_num_symbols);
+      std::optional<DataExtractor> strtab_data = GetDynstrData();
+      if (symtab_data && strtab_data) {
+        auto [num_symbols_parsed, address_class_map] = ParseSymbols(
+            &lldb_symtab, symbol_id, section_list, dynamic_num_symbols,
+            symtab_data.value(), strtab_data.value());
+        symbol_id += num_symbols_parsed;
+        m_address_class_map.merge(address_class_map);
+      }
     }
   }
 
@@ -3828,6 +3842,32 @@ ObjectFileELF::MapFileDataWritable(const FileSpec &file, uint64_t Size,
                                                          Offset);
 }
 
+std::optional<DataExtractor>
+ObjectFileELF::ReadDataFromDynamic(const ELFDynamic *dyn, uint64_t length,
+                                   uint64_t offset) {
+  // ELFDynamic values contain a "d_ptr" member that will be a load address if
+  // we have an ELF file read from memory, or it will be a file address if it
+  // was read from an ELF file. This function will correctly fetch data pointed
+  // to by the ELFDynamic::d_ptr, or return std::nullopt if the data isn't
+  // available.
+  const lldb::addr_t d_ptr_addr = dyn->d_ptr + offset;
+  if (ProcessSP process_sp = m_process_wp.lock()) {
+    if (DataBufferSP data_sp = ReadMemory(process_sp, d_ptr_addr, length))
+      return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize());
+  } else {
+    // No live process: treat the address as a file address, find the section
+    // that contains it and extract the requested bytes from that section.
+    Address addr;
+    if (!addr.ResolveAddressUsingFileSections(d_ptr_addr, GetSectionList()))
+      return std::nullopt;
+    DataExtractor data;
+    addr.GetSection()->GetSectionData(data);
+    return DataExtractor(data, d_ptr_addr - addr.GetSection()->GetFileAddress(),
+                         length);
+  }
+  return std::nullopt; // the process memory could not be read
+}
+
 std::optional<DataExtractor> ObjectFileELF::GetDynstrData() {
   if (SectionList *section_list = GetSectionList()) {
     // Find the SHT_DYNAMIC section.
@@ -3855,31 +3895,15 @@ std::optional<DataExtractor> ObjectFileELF::GetDynstrData() {
   // and represent the dynamic symbol tables's string table. These are needed
   // by the dynamic loader and we can read them from a process' address space.
   //
-  // When loading and ELF file from memory, only the program headers end up
-  // being mapped into memory, and we can find these values in the PT_DYNAMIC
-  // segment.
+  // When loading an ELF file from memory, only the program headers are
+  // guaranteed to end up being mapped into memory, and we can find these
+  // values in the PT_DYNAMIC segment.
   const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB);
   const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ);
   if (strtab == nullptr || strsz == nullptr)
     return std::nullopt;
 
-  if (ProcessSP process_sp = m_process_wp.lock()) {
-    if (DataBufferSP data_sp =
-            ReadMemory(process_sp, strtab->d_ptr, strsz->d_val))
-      return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize());
-  } else {
-    // We have an ELF file with no section headers or we didn't find the
-    // .dynamic section. Try and find the .dynstr section.
-    Address addr;
-    if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) {
-      DataExtractor data;
-      addr.GetSection()->GetSectionData(data);
-      return DataExtractor(data,
-                           strtab->d_ptr - addr.GetSection()->GetFileAddress(),
-                           strsz->d_val);
-    }
-  }
-  return std::nullopt;
+  return ReadDataFromDynamic(strtab, strsz->d_val, /*offset=*/0);
 }
 
 std::optional<lldb_private::DataExtractor> ObjectFileELF::GetDynamicData() {
@@ -3912,3 +3936,119 @@ std::optional<lldb_private::DataExtractor> ObjectFileELF::GetDynamicData() {
   }
   return std::nullopt;
 }
+
+std::optional<uint32_t> ObjectFileELF::GetNumSymbolsFromDynamicHash() {
+  const ELFDynamic *hash = FindDynamicSymbol(DT_HASH);
+  if (hash == nullptr)
+    return std::nullopt;
+
+  // The DT_HASH header looks like this:
+  struct DtHashHeader {
+    uint32_t nbucket;
+    uint32_t nchain;
+  };
+  if (auto data = ReadDataFromDynamic(hash, sizeof(DtHashHeader))) {
+    // We don't need the number of buckets value "nbucket", we just need the
+    // "nchain" value which contains the number of symbols.
+    offset_t offset = offsetof(DtHashHeader, nchain);
+    return data->GetU32(&offset);
+  }
+  // The DT_HASH header could not be read.
+  return std::nullopt;
+}
+
+std::optional<uint32_t> ObjectFileELF::GetNumSymbolsFromDynamicGnuHash() {
+  const ELFDynamic *gnu_hash = FindDynamicSymbol(DT_GNU_HASH);
+  if (gnu_hash == nullptr)
+    return std::nullopt;
+
+  // The DT_GNU_HASH header looks like this, see:
+  // https://flapenguin.me/elf-dt-gnu-hash
+  struct DtGnuHashHeader {
+    uint32_t nbuckets = 0;
+    uint32_t symoffset = 0;
+    uint32_t bloom_size = 0;
+    uint32_t bloom_shift = 0;
+  };
+  uint32_t num_symbols = 0;
+  // Read enough data for the DT_GNU_HASH header so we can extract the values.
+  if (auto data = ReadDataFromDynamic(gnu_hash, sizeof(DtGnuHashHeader))) {
+    offset_t offset = 0;
+    DtGnuHashHeader header;
+    header.nbuckets = data->GetU32(&offset);
+    header.symoffset = data->GetU32(&offset);
+    header.bloom_size = data->GetU32(&offset);
+    header.bloom_shift = data->GetU32(&offset);
+    const size_t addr_size = GetAddressByteSize();
+    const addr_t buckets_offset =
+        sizeof(DtGnuHashHeader) + addr_size * header.bloom_size;
+    std::vector<uint32_t> buckets;
+    if (auto bucket_data = ReadDataFromDynamic(gnu_hash, header.nbuckets * 4,
+                                               buckets_offset)) {
+      offset = 0;
+      for (uint32_t i = 0; i < header.nbuckets; ++i)
+        buckets.push_back(bucket_data->GetU32(&offset));
+      // Find the largest value stored in any bucket; its chain is walked below.
+      uint32_t last_symbol = 0;
+      for (uint32_t bucket_value : buckets)
+        last_symbol = std::max(bucket_value, last_symbol);
+      if (last_symbol < header.symoffset) {
+        num_symbols = header.symoffset;
+      } else {
+        // Walk the bucket's chain to add the chain length to the total.
+        const addr_t chains_base_offset = buckets_offset + header.nbuckets * 4;
+        for (;;) {
+          if (auto chain_entry_data = ReadDataFromDynamic(
+                  gnu_hash, 4,
+                  chains_base_offset + (last_symbol - header.symoffset) * 4)) {
+            offset = 0;
+            uint32_t chain_entry = chain_entry_data->GetU32(&offset);
+            ++last_symbol;
+            // If the low bit is set, this entry is the end of the chain.
+            if (chain_entry & 1)
+              break;
+          } else {
+            break; // the chain entry could not be read
+          }
+        }
+        num_symbols = last_symbol;
+      }
+    }
+  }
+  if (num_symbols > 0)
+    return num_symbols;
+  // The DT_GNU_HASH data could not be read or yielded a count of zero.
+  return std::nullopt;
+}
+
+std::optional<DataExtractor>
+ObjectFileELF::GetDynsymDataFromDynamic(uint32_t &num_symbols) {
+  // Every ELF file which represents an executable or shared library has
+  // mandatory .dynamic entries. The DT_SYMTAB value contains a pointer to the
+  // symbol table, and DT_SYMENT contains the size of a symbol table entry.
+  // We then can use either the DT_HASH or DT_GNU_HASH to find the number of
+  // symbols in the symbol table as the symbol count is not stored in the
+  // .dynamic section as a key/value pair.
+  //
+  // When loading an ELF file from memory, only the program headers end up
+  // being mapped into memory, and we can find these values in the PT_DYNAMIC
+  // segment.
+  num_symbols = 0;
+  // ReadDataFromDynamic() picks between process memory and file sections on
+  // its own, so no process needs to be looked up here.
+  const ELFDynamic *symtab = FindDynamicSymbol(DT_SYMTAB);
+  const ELFDynamic *syment = FindDynamicSymbol(DT_SYMENT);
+  // DT_SYMTAB and DT_SYMENT are mandatory.
+  if (symtab == nullptr || syment == nullptr)
+    return std::nullopt;
+
+  if (std::optional<uint32_t> syms = GetNumSymbolsFromDynamicHash())
+    num_symbols = *syms;
+  else if (std::optional<uint32_t> syms = GetNumSymbolsFromDynamicGnuHash())
+    num_symbols = *syms;
+  else
+    return std::nullopt;
+  if (num_symbols == 0)
+    return std::nullopt;
+  return ReadDataFromDynamic(symtab, syment->d_val * num_symbols);
+}
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
index aba3a5bfcbf5b..41b8ce189e41d 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
@@ -435,6 +435,46 @@ class ObjectFileELF : public lldb_private::ObjectFile {
   /// \return The bytes that represent the string table data or \c std::nullopt
   ///         if an error occured.
   std::optional<lldb_private::DataExtractor> GetDynstrData();
+
+  /// Read the bytes pointed to by the \a dyn dynamic entry.
+  ///
+  /// ELFDynamic::d_ptr values contain file addresses if we load the ELF file
+  /// from a file on disk, or they contain load addresses if they were read
+  /// from memory. This function will correctly extract the data in both cases
+  /// if it is available.
+  ///
+  /// \param[in] dyn The dynamic entry to use to fetch the data from.
+  ///
+  /// \param[in] length The number of bytes to read.
+  ///
+  /// \param[in] offset The number of bytes to skip after the d_ptr value
+  ///                   before reading data.
+  ///
+  /// \return The bytes that represent the dynamic entry's data or
+  ///         \c std::nullopt if an error occurred or the data is not available.
+  std::optional<lldb_private::DataExtractor>
+  ReadDataFromDynamic(const elf::ELFDynamic *dyn, uint64_t length,
+                      uint64_t offset = 0);
+
+  /// Get the bytes that represent the dynamic symbol table from the .dynamic
+  /// section from process memory.
+  ///
+  /// This function uses the DT_SYMTAB value from the .dynamic section to read
+  /// the symbol table data from process memory. The number of symbols in the
+  /// symbol table is calculated by looking at the DT_HASH or DT_GNU_HASH
+  /// values as the symbol count isn't stored in the .dynamic section.
+  ///
+  /// \return The bytes that represent the symbol table data from the .dynamic
+  ///         section or section headers or \c std::nullopt if an error
+  ///         occurred or if there is no dynamic symbol data available.
+  std::optional<lldb_private::DataExtractor>
+  GetDynsymDataFromDynamic(uint32_t &num_symbols);
+
+  /// Get the number of symbols from the DT_HASH dynamic entry.
+  std::optional<uint32_t> GetNumSymbolsFromDynamicHash();
+
+  /// Get the number of symbols from the DT_GNU_HASH dynamic entry.
+  std::optional<uint32_t> GetNumSymbolsFromDynamicGnuHash();
 };
 
 #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index d9bdeb560e122..37c1132c1c9f9 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -45,6 +45,7 @@
 #include "clang/AST/Type.h"
 #include "clang/Basic/Specifiers.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h"
 #include "llvm/Demangle/Demangle.h"
 
 #include <map>
@@ -826,11 +827,11 @@ std::string DWARFASTParserClang::GetDIEClassTemplateParams(DWARFDIE die) {
   if (llvm::StringRef(die.GetName()).contains("<"))
     return {};
 
-  TypeSystemClang::TemplateParameterInfos template_param_infos;
-  if (ParseTemplateParameterInfos(die, template_param_infos))
-    return m_ast.PrintTemplateParams(template_param_infos);
-
-  return {};
+  std::string name;
+  llvm::raw_string_ostream os(name);
+  llvm::DWARFTypePrinter<DWARFDIE> type_printer(os);
+  type_printer.appendAndTerminateTemplateParameters(die);
+  return name;
 }
 
 void DWARFASTParserClang::MapDeclDIEToDefDIE(
@@ -1618,9 +1619,9 @@ void DWARFASTParserClang::GetUniqueTypeNameAndDeclaration(
     case DW_TAG_structure_type:
     case DW_TAG_union_type: {
       if (const char *class_union_struct_name = parent_decl_ctx_die.GetName()) {
-        qualified_name.insert(
-            0, GetDIEClassTemplateParams(parent_decl_ctx_die));
         qualified_name.insert(0, "::");
+        qualified_name.insert(0,
+                              GetDIEClassTemplateParams(parent_decl_ctx_die));
         qualified_name.insert(0, class_union_struct_name);
       }
       parent_decl_ctx_die = parent_decl_ctx_die.GetParentDeclContextDIE();
@@ -1673,6 +1674,12 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
   if (attrs.name) {
     GetUniqueTypeNameAndDeclaration(die, cu_language, unique_typename,
                                     unique_decl);
+    if (log) {
+      dwarf->GetObjectFile()->GetModule()->LogMessage(
+          log, "SymbolFileDWARF({0:p}) - {1:x16}: {2} has unique name: {3} ",
+          static_cast<void *>(this), die.GetID(), DW_TAG_value_to_name(tag),
+          unique_typename.AsCString());
+    }
     if (UniqueDWARFASTType *unique_ast_entry_type =
             dwarf->GetUniqueDWARFASTTypeMap().Find(
                 unique_typename, die, unique_decl, byte_size,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
index 235343d227122..d92de658a49e8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
@@ -24,9 +24,11 @@ class DWARFUnit;
 class DWARFDebugInfoEntry;
 class DWARFDeclContext;
 class SymbolFileDWARF;
+class DWARFFormValue;
 
 class DWARFBaseDIE {
 public:
+  using DWARFFormValue = dwarf::DWARFFormValue;
   DWARFBaseDIE() = default;
 
   DWARFBaseDIE(DWARFUnit *cu, DWARFDebugInfoEntry *die)
@@ -117,6 +119,12 @@ class DWARFBaseDIE {
   enum class Recurse : bool { no, yes };
   DWARFAttributes GetAttributes(Recurse recurse = Recurse::yes) const;
 
+  // The following methods use LLVM naming convention in order to be used by
+  // LLVM libraries.
+  dw_tag_t getTag() const { return Tag(); }
+
+  const char *getShortName() const { return GetName(); }
+
 protected:
   DWARFUnit *m_cu = nullptr;
   DWARFDebugInfoEntry *m_die = nullptr;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
index 4c9f1d8505f6e..362f4c44240c7 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
@@ -572,6 +572,43 @@ bool DWARFDIE::GetDIENamesAndRanges(
     return false;
 }
 
+// The following methods use LLVM naming convention in order to be used by
+// LLVM libraries.
 llvm::iterator_range<DWARFDIE::child_iterator> DWARFDIE::children() const {
   return llvm::make_range(child_iterator(*this), child_iterator());
 }
+
+DWARFDIE::child_iterator DWARFDIE::begin() const {
+  return child_iterator(*this);
+}
+
+DWARFDIE::child_iterator DWARFDIE::end() const { return child_iterator(); }
+
+std::optional<DWARFFormValue> DWARFDIE::find(const dw_attr_t attr) const {
+  DWARFFormValue value; // filled in only when the attribute is found
+  if (IsValid() && m_die->GetAttributeValue(m_cu, attr, value, nullptr, false))
+    return value;
+  return std::nullopt; // invalid DIE, or the attribute is not present
+}
+
+std::optional<uint64_t> DWARFDIE::getLanguage() const {
+  if (IsValid())
+    return m_cu->GetDWARFLanguageType();
+  return std::nullopt;
+}
+
+DWARFDIE DWARFDIE::resolveReferencedType(dw_attr_t attr) const {
+  return GetReferencedDIE(attr);
+}
+
+DWARFDIE DWARFDIE::resolveReferencedType(DWARFFormValue v) const {
+  if (IsValid())
+    return v.Reference();
+  return {};
+}
+
+DWARFDIE DWARFDIE::resolveTypeUnitReference() const {
+  if (DWARFDIE reference = GetReferencedDIE(DW_AT_signature))
+    return reference;
+  return *this;
+}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h
index 077b78eb26d0c..5c1d381930c4e 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h
@@ -103,8 +103,25 @@ class DWARFDIE : public DWARFBaseDIE {
       std::optional<int> &call_line, std::optional<int> &call_column,
       DWARFExpressionList *frame_base) const;
 
+  // The following methods use LLVM naming convention in order to be used by
+  // LLVM libraries.
+  std::optional<uint64_t> getLanguage() const;
+
+  DWARFDIE getParent() const { return GetParent(); }
+
+  DWARFDIE resolveReferencedType(dw_attr_t attr) const;
+
+  DWARFDIE resolveReferencedType(DWARFFormValue v) const;
+
+  DWARFDIE resolveTypeUnitReference() const;
+
+  std::optional<DWARFFormValue> find(const dw_attr_t attr) const;
+
   /// The range of all the children of this DIE.
   llvm::iterator_range<child_iterator> children() const;
+
+  child_iterator begin() const;
+  child_iterator end() const;
 };
 
 class DWARFDIE::child_iterator
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
index 404e50d57a925..fd3d45cef4c5e 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
@@ -574,6 +574,31 @@ uint64_t DWARFFormValue::Reference(dw_offset_t base_offset) const {
   }
 }
 
+std::optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
+  if ((!IsDataForm(m_form)) || m_form == lldb_private::dwarf::DW_FORM_sdata)
+    return std::nullopt;
+  return m_value.uval;
+}
+
+std::optional<int64_t> DWARFFormValue::getAsSignedConstant() const {
+  if ((!IsDataForm(m_form)) ||
+      (m_form == lldb_private::dwarf::DW_FORM_udata &&
+       uint64_t(std::numeric_limits<int64_t>::max()) < m_value.uval))
+    return std::nullopt;
+  switch (m_form) {
+  case lldb_private::dwarf::DW_FORM_data4:
+    return int32_t(m_value.uval);
+  case lldb_private::dwarf::DW_FORM_data2:
+    return int16_t(m_value.uval);
+  case lldb_private::dwarf::DW_FORM_data1:
+    return int8_t(m_value.uval);
+  case lldb_private::dwarf::DW_FORM_sdata:
+  case lldb_private::dwarf::DW_FORM_data8:
+  default:
+    return m_value.sval;
+  }
+}
+
 const uint8_t *DWARFFormValue::BlockData() const { return m_value.data; }
 
 bool DWARFFormValue::IsBlockForm(const dw_form_t form) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
index 8ab9163e645fe..613948f2f3c9b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
@@ -76,6 +76,12 @@ class DWARFFormValue {
   void Clear();
   static bool FormIsSupported(dw_form_t form);
 
+  // The following methods use LLVM naming convention in order to be used by
+  // LLVM libraries.
+  std::optional<uint64_t> getAsUnsignedConstant() const;
+  std::optional<int64_t> getAsSignedConstant() const;
+  const char *getAsCString() const { return AsCString(); }
+
 protected:
   // Compile unit where m_value was located.
   // It may be different from compile unit where m_value refers to.
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 8ce0db4588a46..47050d86409a6 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -9,6 +9,7 @@
 #include "SymbolFileDWARF.h"
 
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/Format.h"
@@ -2810,33 +2811,14 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) {
             return true; // Keep iterating over index types, language mismatch.
         }
 
-        // Check the context matches
-        std::vector<lldb_private::CompilerContext> die_context;
-        if (query.GetModuleSearch())
-          die_context = die.GetDeclContext();
-        else
-          die_context = die.GetTypeLookupContext();
-        assert(!die_context.empty());
-        if (!query_simple.ContextMatches(die_context))
-          return true; // Keep iterating over index types, context mismatch.
-
-        // Try to resolve the type.
-        if (Type *matching_type = ResolveType(die, true, true)) {
-          ConstString name = matching_type->GetQualifiedName();
-          // We have found a type that still might not match due to template
-          // parameters. If we create a new TypeQuery that uses the new type's
-          // fully qualified name, we can find out if this type matches at all
-          // context levels. We can't use just the "match_simple" context
-          // because all template parameters were stripped off. The fully
-          // qualified name of the type will have the template parameters and
-          // will allow us to make sure it matches correctly.
-          TypeQuery die_query(name.GetStringRef(),
-                              TypeQueryOptions::e_exact_match);
-          if (!query.ContextMatches(die_query.GetContextRef()))
-            return true; // Keep iterating over index types, context mismatch.
-
-          results.InsertUnique(matching_type->shared_from_this());
-        }
+        std::string qualified_name;
+        llvm::raw_string_ostream os(qualified_name);
+        llvm::DWARFTypePrinter<DWARFDIE> type_printer(os);
+        type_printer.appendQualifiedName(die);
+        TypeQuery die_query(qualified_name, e_exact_match);
+        if (query.ContextMatches(die_query.GetContextRef()))
+          if (Type *matching_type = ResolveType(die, true, true))
+            results.InsertUnique(matching_type->shared_from_this());
         return !results.Done(query); // Keep iterating if we aren't done.
       });
       if (results.Done(query)) {
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 1a77c7cf9161a..5f8163211857c 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -1403,26 +1403,6 @@ static TemplateParameterList *CreateTemplateParameterList(
   return template_param_list;
 }
 
-std::string TypeSystemClang::PrintTemplateParams(
-    const TemplateParameterInfos &template_param_infos) {
-  llvm::SmallVector<NamedDecl *, 8> ignore;
-  clang::TemplateParameterList *template_param_list =
-      CreateTemplateParameterList(getASTContext(), template_param_infos,
-                                  ignore);
-  llvm::SmallVector<clang::TemplateArgument, 2> args(
-      template_param_infos.GetArgs());
-  if (template_param_infos.hasParameterPack()) {
-    llvm::ArrayRef<TemplateArgument> pack_args =
-        template_param_infos.GetParameterPackArgs();
-    args.append(pack_args.begin(), pack_args.end());
-  }
-  std::string str;
-  llvm::raw_string_ostream os(str);
-  clang::printTemplateArgumentList(os, args, GetTypePrintingPolicy(),
-                                   template_param_list);
-  return str;
-}
-
 clang::FunctionTemplateDecl *TypeSystemClang::CreateFunctionTemplateDecl(
     clang::DeclContext *decl_ctx, OptionalClangModuleID owning_module,
     clang::FunctionDecl *func_decl,
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index e39aedec7e390..678eaed381fd4 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -1148,10 +1148,6 @@ class TypeSystemClang : public TypeSystem {
 
   bool SetDeclIsForcefullyCompleted(const clang::TagDecl *td);
 
-  /// Return the template parameters (including surrounding <>) in string form.
-  std::string
-  PrintTemplateParams(const TemplateParameterInfos &template_param_infos);
-
 private:
   /// Returns the PrintingPolicy used when generating the internal type names.
   /// These type names are mostly used for the formatter selection.
diff --git a/lldb/source/Target/LanguageRuntime.cpp b/lldb/source/Target/LanguageRuntime.cpp
index ce3646c8b05c8..269d1e017fdf2 100644
--- a/lldb/source/Target/LanguageRuntime.cpp
+++ b/lldb/source/Target/LanguageRuntime.cpp
@@ -277,6 +277,11 @@ LanguageRuntime::GetRuntimeUnwindPlan(Thread &thread, RegisterContext *regctx,
   return UnwindPlanSP();
 }
 
+StructuredData::ObjectSP
+LanguageRuntime::GetLanguageSpecificData(SymbolContext sc) {
+  return {};
+}
+
 void LanguageRuntime::InitializeCommands(CommandObject *parent) {
   if (!parent)
     return;
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index 9a4a8db84a9fa..dbe885e286ff7 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -1402,7 +1402,7 @@ RegisterContextUnwind::SavedLocationForRegister(
       // it's still live in the actual register. Handle this specially.
 
       if (!have_unwindplan_regloc && return_address_reg.IsValid() &&
-          BehavesLikeZerothFrame()) {
+          IsFrameZero()) {
         if (return_address_reg.GetAsKind(eRegisterKindLLDB) !=
             LLDB_INVALID_REGNUM) {
           lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc;
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 1bca9786fb7c7..dfbac5a572d00 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -22,6 +22,7 @@
 #include "lldb/Symbol/VariableList.h"
 #include "lldb/Target/ABI.h"
 #include "lldb/Target/ExecutionContext.h"
+#include "lldb/Target/LanguageRuntime.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/RegisterContext.h"
 #include "lldb/Target/StackFrameRecognizer.h"
@@ -1230,6 +1231,18 @@ bool StackFrame::IsHidden() {
   return false;
 }
 
+StructuredData::ObjectSP StackFrame::GetLanguageSpecificData() {
+  auto process_sp = CalculateProcess();
+  SourceLanguage language = GetLanguage();
+  if (!process_sp || !language) // No live process or unknown language.
+    return {};
+  if (auto runtime_sp =
+          process_sp->GetLanguageRuntime(language.AsLanguageType()))
+    return runtime_sp->GetLanguageSpecificData(
+        GetSymbolContext(eSymbolContextFunction));
+  return {};
+}
+
 const char *StackFrame::GetFunctionName() {
   const char *name = nullptr;
   SymbolContext sc = GetSymbolContext(
@@ -2007,19 +2020,9 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source,
           if (num_lines != 0)
             have_source = true;
           // TODO: Give here a one time warning if source file is missing.
-          if (!m_sc.line_entry.line) {
-            ConstString fn_name = m_sc.GetFunctionName();
-
-            if (!fn_name.IsEmpty())
-              strm.Printf(
-                  "Note: this address is compiler-generated code in function "
-                  "%s that has no source code associated with it.",
-                  fn_name.AsCString());
-            else
-              strm.Printf("Note: this address is compiler-generated code that "
-                          "has no source code associated with it.");
-            strm.EOL();
-          }
+          if (!m_sc.line_entry.line)
+            strm << "note: This address is not associated with a specific line "
+                    "of code. This may be due to compiler optimizations.\n";
         }
       }
       switch (disasm_display) {
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 735295e6f2593..fb17276051909 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -244,7 +244,7 @@ Thread::~Thread() {
   LLDB_LOGF(log, "%p Thread::~Thread(tid = 0x%4.4" PRIx64 ")",
             static_cast<void *>(this), GetID());
   /// If you hit this assert, it means your derived class forgot to call
-  /// DoDestroy in its destructor.
+  /// DestroyThread in its destructor.
   assert(m_destroy_called);
 }
 
diff --git a/lldb/source/Target/ThreadPlanStack.cpp b/lldb/source/Target/ThreadPlanStack.cpp
index 1572931429071..d5d600dda47a3 100644
--- a/lldb/source/Target/ThreadPlanStack.cpp
+++ b/lldb/source/Target/ThreadPlanStack.cpp
@@ -39,21 +39,21 @@ ThreadPlanStack::ThreadPlanStack(const Thread &thread, bool make_null) {
 void ThreadPlanStack::DumpThreadPlans(Stream &s,
                                       lldb::DescriptionLevel desc_level,
                                       bool include_internal) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   s.IndentMore();
-  PrintOneStack(s, "Active plan stack", m_plans, desc_level, include_internal);
-  PrintOneStack(s, "Completed plan stack", m_completed_plans, desc_level,
-                include_internal);
-  PrintOneStack(s, "Discarded plan stack", m_discarded_plans, desc_level,
-                include_internal);
+  PrintOneStackNoLock(s, "Active plan stack", m_plans, desc_level,
+                      include_internal);
+  PrintOneStackNoLock(s, "Completed plan stack", m_completed_plans, desc_level,
+                      include_internal);
+  PrintOneStackNoLock(s, "Discarded plan stack", m_discarded_plans, desc_level,
+                      include_internal);
   s.IndentLess();
 }
 
-void ThreadPlanStack::PrintOneStack(Stream &s, llvm::StringRef stack_name,
-                                    const PlanStack &stack,
-                                    lldb::DescriptionLevel desc_level,
-                                    bool include_internal) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+void ThreadPlanStack::PrintOneStackNoLock(Stream &s, llvm::StringRef stack_name,
+                                          const PlanStack &stack,
+                                          lldb::DescriptionLevel desc_level,
+                                          bool include_internal) const {
   // If the stack is empty, just exit:
   if (stack.empty())
     return;
@@ -82,7 +82,7 @@ void ThreadPlanStack::PrintOneStack(Stream &s, llvm::StringRef stack_name,
 }
 
 size_t ThreadPlanStack::CheckpointCompletedPlans() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   m_completed_plan_checkpoint++;
   m_completed_plan_store.insert(
       std::make_pair(m_completed_plan_checkpoint, m_completed_plans));
@@ -90,7 +90,7 @@ size_t ThreadPlanStack::CheckpointCompletedPlans() {
 }
 
 void ThreadPlanStack::RestoreCompletedPlanCheckpoint(size_t checkpoint) {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   auto result = m_completed_plan_store.find(checkpoint);
   assert(result != m_completed_plan_store.end() &&
          "Asked for a checkpoint that didn't exist");
@@ -99,13 +99,13 @@ void ThreadPlanStack::RestoreCompletedPlanCheckpoint(size_t checkpoint) {
 }
 
 void ThreadPlanStack::DiscardCompletedPlanCheckpoint(size_t checkpoint) {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   m_completed_plan_store.erase(checkpoint);
 }
 
 void ThreadPlanStack::ThreadDestroyed(Thread *thread) {
   // Tell the plan stacks that this thread is going away:
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   for (ThreadPlanSP plan : m_plans)
     plan->ThreadDestroyed();
 
@@ -134,20 +134,22 @@ void ThreadPlanStack::PushPlan(lldb::ThreadPlanSP new_plan_sp) {
   // If the thread plan doesn't already have a tracer, give it its parent's
   // tracer:
   // The first plan has to be a base plan:
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
-  assert((m_plans.size() > 0 || new_plan_sp->IsBasePlan()) &&
-         "Zeroth plan must be a base plan");
-
-  if (!new_plan_sp->GetThreadPlanTracer()) {
-    assert(!m_plans.empty());
-    new_plan_sp->SetThreadPlanTracer(m_plans.back()->GetThreadPlanTracer());
+  { // Scope for Lock - DidPush often adds plans to the stack:
+    llvm::sys::ScopedWriter guard(m_stack_mutex);
+    assert((m_plans.size() > 0 || new_plan_sp->IsBasePlan()) &&
+           "Zeroth plan must be a base plan");
+
+    if (!new_plan_sp->GetThreadPlanTracer()) {
+      assert(!m_plans.empty());
+      new_plan_sp->SetThreadPlanTracer(m_plans.back()->GetThreadPlanTracer());
+    }
+    m_plans.push_back(new_plan_sp);
   }
-  m_plans.push_back(new_plan_sp);
   new_plan_sp->DidPush();
 }
 
 lldb::ThreadPlanSP ThreadPlanStack::PopPlan() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   assert(m_plans.size() > 1 && "Can't pop the base thread plan");
 
   // Note that moving the top element of the vector would leave it in an
@@ -161,7 +163,11 @@ lldb::ThreadPlanSP ThreadPlanStack::PopPlan() {
 }
 
 lldb::ThreadPlanSP ThreadPlanStack::DiscardPlan() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
+  return DiscardPlanNoLock();
+}
+
+lldb::ThreadPlanSP ThreadPlanStack::DiscardPlanNoLock() {
   assert(m_plans.size() > 1 && "Can't discard the base thread plan");
 
   // Note that moving the top element of the vector would leave it in an
@@ -177,12 +183,12 @@ lldb::ThreadPlanSP ThreadPlanStack::DiscardPlan() {
 // If the input plan is nullptr, discard all plans.  Otherwise make sure this
 // plan is in the stack, and if so discard up to and including it.
 void ThreadPlanStack::DiscardPlansUpToPlan(ThreadPlan *up_to_plan_ptr) {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   int stack_size = m_plans.size();
 
   if (up_to_plan_ptr == nullptr) {
     for (int i = stack_size - 1; i > 0; i--)
-      DiscardPlan();
+      DiscardPlanNoLock();
     return;
   }
 
@@ -197,23 +203,23 @@ void ThreadPlanStack::DiscardPlansUpToPlan(ThreadPlan *up_to_plan_ptr) {
   if (found_it) {
     bool last_one = false;
     for (int i = stack_size - 1; i > 0 && !last_one; i--) {
-      if (GetCurrentPlan().get() == up_to_plan_ptr)
+      if (GetCurrentPlanNoLock().get() == up_to_plan_ptr)
         last_one = true;
-      DiscardPlan();
+      DiscardPlanNoLock();
     }
   }
 }
 
 void ThreadPlanStack::DiscardAllPlans() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   int stack_size = m_plans.size();
   for (int i = stack_size - 1; i > 0; i--) {
-    DiscardPlan();
+    DiscardPlanNoLock();
   }
 }
 
 void ThreadPlanStack::DiscardConsultingControllingPlans() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   while (true) {
     int controlling_plan_idx;
     bool discard = true;
@@ -234,26 +240,30 @@ void ThreadPlanStack::DiscardConsultingControllingPlans() {
 
     // First pop all the dependent plans:
     for (int i = m_plans.size() - 1; i > controlling_plan_idx; i--) {
-      DiscardPlan();
+      DiscardPlanNoLock();
     }
 
     // Now discard the controlling plan itself.
     // The bottom-most plan never gets discarded.  "OkayToDiscard" for it
     // means discard it's dependent plans, but not it...
     if (controlling_plan_idx > 0) {
-      DiscardPlan();
+      DiscardPlanNoLock();
     }
   }
 }
 
 lldb::ThreadPlanSP ThreadPlanStack::GetCurrentPlan() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
+  return GetCurrentPlanNoLock();
+}
+
+lldb::ThreadPlanSP ThreadPlanStack::GetCurrentPlanNoLock() const {
   assert(m_plans.size() != 0 && "There will always be a base plan.");
   return m_plans.back();
 }
 
 lldb::ThreadPlanSP ThreadPlanStack::GetCompletedPlan(bool skip_private) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   if (m_completed_plans.empty())
     return {};
 
@@ -271,7 +281,7 @@ lldb::ThreadPlanSP ThreadPlanStack::GetCompletedPlan(bool skip_private) const {
 
 lldb::ThreadPlanSP ThreadPlanStack::GetPlanByIndex(uint32_t plan_idx,
                                                    bool skip_private) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   uint32_t idx = 0;
 
   for (lldb::ThreadPlanSP plan_sp : m_plans) {
@@ -285,7 +295,7 @@ lldb::ThreadPlanSP ThreadPlanStack::GetPlanByIndex(uint32_t plan_idx,
 }
 
 lldb::ValueObjectSP ThreadPlanStack::GetReturnValueObject() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   if (m_completed_plans.empty())
     return {};
 
@@ -299,7 +309,7 @@ lldb::ValueObjectSP ThreadPlanStack::GetReturnValueObject() const {
 }
 
 lldb::ExpressionVariableSP ThreadPlanStack::GetExpressionVariable() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   if (m_completed_plans.empty())
     return {};
 
@@ -312,23 +322,23 @@ lldb::ExpressionVariableSP ThreadPlanStack::GetExpressionVariable() const {
   return {};
 }
 bool ThreadPlanStack::AnyPlans() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   // There is always a base plan...
   return m_plans.size() > 1;
 }
 
 bool ThreadPlanStack::AnyCompletedPlans() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   return !m_completed_plans.empty();
 }
 
 bool ThreadPlanStack::AnyDiscardedPlans() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   return !m_discarded_plans.empty();
 }
 
 bool ThreadPlanStack::IsPlanDone(ThreadPlan *in_plan) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   for (auto plan : m_completed_plans) {
     if (plan.get() == in_plan)
       return true;
@@ -337,7 +347,7 @@ bool ThreadPlanStack::IsPlanDone(ThreadPlan *in_plan) const {
 }
 
 bool ThreadPlanStack::WasPlanDiscarded(ThreadPlan *in_plan) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   for (auto plan : m_discarded_plans) {
     if (plan.get() == in_plan)
       return true;
@@ -346,7 +356,7 @@ bool ThreadPlanStack::WasPlanDiscarded(ThreadPlan *in_plan) const {
 }
 
 ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   if (current_plan == nullptr)
     return nullptr;
 
@@ -361,7 +371,7 @@ ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const {
   // If this is the first completed plan, the previous one is the
   // bottom of the regular plan stack.
   if (stack_size > 0 && m_completed_plans[0].get() == current_plan) {
-    return GetCurrentPlan().get();
+    return GetCurrentPlanNoLock().get();
   }
 
   // Otherwise look for it in the regular plans.
@@ -374,7 +384,7 @@ ThreadPlan *ThreadPlanStack::GetPreviousPlan(ThreadPlan *current_plan) const {
 }
 
 ThreadPlan *ThreadPlanStack::GetInnermostExpression() const {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   int stack_size = m_plans.size();
 
   for (int i = stack_size - 1; i > 0; i--) {
@@ -385,13 +395,13 @@ ThreadPlan *ThreadPlanStack::GetInnermostExpression() const {
 }
 
 void ThreadPlanStack::ClearThreadCache() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedReader guard(m_stack_mutex);
   for (lldb::ThreadPlanSP thread_plan_sp : m_plans)
     thread_plan_sp->ClearThreadCache();
 }
 
 void ThreadPlanStack::WillResume() {
-  std::lock_guard<std::recursive_mutex> guard(m_stack_mutex);
+  llvm::sys::ScopedWriter guard(m_stack_mutex);
   m_completed_plans.clear();
   m_discarded_plans.clear();
 }
diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp
index 208733ffc8685..f5aa27baadfef 100644
--- a/lldb/source/Utility/DiagnosticsRendering.cpp
+++ b/lldb/source/Utility/DiagnosticsRendering.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Utility/DiagnosticsRendering.h"
+#include <cstdint>
 
 using namespace lldb_private;
 using namespace lldb;
@@ -98,7 +99,7 @@ void RenderDiagnosticDetails(Stream &stream,
   }
 
   // Sort the diagnostics.
-  auto sort = [](auto &ds) {
+  auto sort = [](std::vector<DiagnosticDetail> &ds) {
     std::stable_sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) {
       auto l1 = d1.source_location.value_or(DiagnosticDetail::SourceLocation{});
       auto l2 = d2.source_location.value_or(DiagnosticDetail::SourceLocation{});
@@ -121,15 +122,27 @@ void RenderDiagnosticDetails(Stream &stream,
         continue;
 
       stream << std::string(loc.column - x_pos, ' ') << cursor;
-      ++x_pos;
+      x_pos = loc.column + 1;
       for (unsigned i = 0; i + 1 < loc.length; ++i) {
         stream << underline;
-        ++x_pos;
+        x_pos += 1;
       }
     }
   }
   stream << '\n';
 
+  // Reverse the order within groups of diagnostics that are on the same column.
+  auto group = [](std::vector<DiagnosticDetail> &details) {
+    for (auto it = details.begin(), end = details.end(); it != end;) {
+      auto eq_end = std::find_if(it, end, [&](const DiagnosticDetail &d) {
+        return d.source_location->column != it->source_location->column;
+      });
+      std::reverse(it, eq_end);
+      it = eq_end;
+    }
+  };
+  group(remaining_details);
+
   // Work through each detail in reverse order using the vector/stack.
   bool did_print = false;
   for (auto detail = remaining_details.rbegin();
@@ -142,14 +155,19 @@ void RenderDiagnosticDetails(Stream &stream,
     for (auto &remaining_detail :
          llvm::ArrayRef(remaining_details).drop_back(1)) {
       uint16_t column = remaining_detail.source_location->column;
-      if (x_pos <= column)
+      // Is this a note with the same column as another diagnostic?
+      if (column == detail->source_location->column)
+        continue;
+
+      if (column >= x_pos) {
         stream << std::string(column - x_pos, ' ') << vbar;
-      x_pos = column + 1;
+        x_pos = column + 1;
+      }
     }
 
-    // Print the line connecting the ^ with the error message.
     uint16_t column = detail->source_location->column;
-    if (x_pos <= column)
+    // Print the line connecting the ^ with the error message.
+    if (column >= x_pos)
       stream << std::string(column - x_pos, ' ') << joint << hbar << spacer;
 
     // Print a colorized string based on the message's severity type.
diff --git a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
index 06f338b3ed1de..bcf8735c7c3f9 100644
--- a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
+++ b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
@@ -3,6 +3,7 @@
 """
 
 import os
+import re
 import lldb
 from lldbsuite.test.lldbtest import *
 import lldbsuite.test.lldbutil as lldbutil
@@ -30,15 +31,11 @@ def test_stdcxx_disasm(self):
                 self.runCmd("disassemble -n '%s'" % function.GetName())
 
         lib_stdcxx = "FAILHORRIBLYHERE"
-        # Iterate through the available modules, looking for stdc++ library...
-        for i in range(target.GetNumModules()):
-            module = target.GetModuleAtIndex(i)
-            fs = module.GetFileSpec()
-            if fs.GetFilename().startswith("libstdc++") or fs.GetFilename().startswith(
-                "libc++"
-            ):
-                lib_stdcxx = str(fs)
-                break
+        # Find the stdc++ library...
+        stdlib_regex = re.compile(r"/lib(std)?c\+\+")
+        for module in target.module[stdlib_regex]:
+            lib_stdcxx = module.file.fullpath
+            break
 
         # At this point, lib_stdcxx is the full path to the stdc++ library and
         # module is the corresponding SBModule.
diff --git a/lldb/test/API/lang/objc/languageinfo/Makefile b/lldb/test/API/lang/objc/languageinfo/Makefile
new file mode 100644
index 0000000000000..11fce1e5c5219
--- /dev/null
+++ b/lldb/test/API/lang/objc/languageinfo/Makefile
@@ -0,0 +1,4 @@
+OBJC_SOURCES := main.m
+LD_EXTRAS := -lobjc
+
+include Makefile.rules
diff --git a/lldb/test/API/lang/objc/languageinfo/TestObjCLanguageSpecificData.py b/lldb/test/API/lang/objc/languageinfo/TestObjCLanguageSpecificData.py
new file mode 100644
index 0000000000000..5558abed929da
--- /dev/null
+++ b/lldb/test/API/lang/objc/languageinfo/TestObjCLanguageSpecificData.py
@@ -0,0 +1,16 @@
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class ObjCiVarIMPTestCase(TestBase):
+    @skipUnlessDarwin
+    @no_debug_info_test
+    def test_imp_ivar_type(self):
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_name_breakpoint(self, "main")
+        frame = thread.GetFrameAtIndex(0)
+        lang_info = frame.GetLanguageSpecificData()
+        version = lang_info.GetValueForKey("Objective-C runtime version")
+        self.assertEqual(version.GetIntegerValue(), 2)
diff --git a/lldb/test/API/lang/objc/languageinfo/main.m b/lldb/test/API/lang/objc/languageinfo/main.m
new file mode 100644
index 0000000000000..06e216a2fde64
--- /dev/null
+++ b/lldb/test/API/lang/objc/languageinfo/main.m
@@ -0,0 +1 @@
+int main(int argc, char const *argv[]) { return 0; }
diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py
index 7071f094e20f7..7d9ce86cdc353 100644
--- a/lldb/test/API/source-manager/TestSourceManager.py
+++ b/lldb/test/API/source-manager/TestSourceManager.py
@@ -336,8 +336,8 @@ def test_artificial_source_location(self):
                 "stop reason = breakpoint",
                 f"{src_file}:0",
                 "static int foo();",
-                "Note: this address is compiler-generated code in function",
-                "that has no source code associated with it.",
+                "note: This address is not associated with a specific line "
+                "of code. This may be due to compiler optimizations.",
             ],
         )
 
diff --git a/lldb/test/API/terminal/TestEditlineCompletions.py b/lldb/test/API/terminal/TestEditlineCompletions.py
new file mode 100644
index 0000000000000..7fa6f95c130c6
--- /dev/null
+++ b/lldb/test/API/terminal/TestEditlineCompletions.py
@@ -0,0 +1,64 @@
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+from lldbsuite.test.lldbpexpect import PExpectTest
+
+
+class EditlineCompletionsTest(PExpectTest):
+    @skipIfAsan
+    @skipIfEditlineSupportMissing
+    def test_completion_truncated(self):
+        """Test that the completion is correctly truncated."""
+        self.launch(dimensions=(10, 20))
+        self.child.send("_regexp-\t")
+        self.child.expect("        _regexp-a...")
+        self.child.expect("        _regexp-b...")
+
+    @skipIfAsan
+    @skipIfEditlineSupportMissing
+    def test_description_truncated(self):
+        """Test that the description is correctly truncated."""
+        self.launch(dimensions=(10, 70))
+        self.child.send("_regexp-\t")
+        self.child.expect(
+            "        _regexp-attach    -- Attach to process by ID or name."
+        )
+        self.child.expect(
+            "        _regexp-break     -- Set a breakpoint using one of several..."
+        )
+
+    @skipIfAsan
+    @skipIfEditlineSupportMissing
+    def test_separator_omitted(self):
+        """Test that the separator is correctly omitted."""
+        self.launch(dimensions=(10, 32))
+        self.child.send("_regexp-\t")
+        self.child.expect("        _regexp-attach   \r\n")
+        self.child.expect("        _regexp-break    \r\n")
+
+    @skipIfAsan
+    @skipIfEditlineSupportMissing
+    def test_separator(self):
+        """Test that the separator is correctly printed."""
+        self.launch(dimensions=(10, 33))
+        self.child.send("_regexp-\t")
+        self.child.expect("        _regexp-attach    -- A...")
+        self.child.expect("        _regexp-break     -- S...")
+
+    @skipIfAsan
+    @skipIfEditlineSupportMissing
+    def test_multiline_description(self):
+        """Test that multi-line descriptions are correctly padded and truncated."""
+        self.launch(dimensions=(10, 72))
+        self.child.send("k\t")
+        self.child.expect(
+            "        kdp-remote -- Connect to a process via remote KDP server."
+        )
+        self.child.expect(
+            "                      If no UDP port is specified, port 41139 is assu..."
+        )
+        self.child.expect(
+            "                      kdp-remote is an abbreviation for 'process conn..."
+        )
+        self.child.expect("        kill       -- Terminate the current target process.")
diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test b/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test
new file mode 100644
index 0000000000000..c5a6a9720fbec
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test
@@ -0,0 +1,45 @@
+// This test verifies that lldb can load the dynamic symbol table of an ELF
+// file that has no section headers by using the DT_SYMTAB, DT_SYMENT, DT_HASH
+// or DT_GNU_HASH .dynamic key/value pairs that are loaded via the PT_DYNAMIC
+// segment.
+
+// REQUIRES: x86-registered-Target
+// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \
+// RUN:   -o - - <<<".globl defined, undefined; defined:" | \
+// RUN:   ld.lld /dev/stdin -o - --hash-style=gnu -export-dynamic -shared \
+// RUN:   -o %t.gnu
+// RUN: llvm-strip --strip-sections %t.gnu
+// RUN: %lldb %t.gnu -b \
+// RUN:   -o "image dump objfile" \
+// RUN:   | FileCheck %s --dump-input=always --check-prefix=GNU
+// GNU: (lldb) image dump objfile
+// GNU: Dumping headers for 1 module(s).
+// GNU: ObjectFileELF, file =
+// GNU: ELF Header
+// GNU: e_type      = 0x0003 ET_DYN
+// Make sure there are no section headers
+// GNU: e_shnum = 0x00000000
+// Make sure we were able to load the symbols
+// GNU: Symtab, file = {{.*}}elf-dynsym.test.tmp.gnu, num_symbols = 2:
+// GNU-DAG: undefined
+// GNU-DAG: defined
+
+// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \
+// RUN:   -o - - <<<".globl defined, undefined; defined:" | \
+// RUN:   ld.lld /dev/stdin -o - --hash-style=sysv -export-dynamic -shared \
+// RUN:   -o %t.sysv
+// RUN: llvm-strip --strip-sections %t.sysv
+// RUN: %lldb %t.sysv -b \
+// RUN:   -o "image dump objfile" \
+// RUN:   | FileCheck %s --dump-input=always --check-prefix=HASH
+// HASH: (lldb) image dump objfile
+// HASH: Dumping headers for 1 module(s).
+// HASH: ObjectFileELF, file =
+// HASH: ELF Header
+// HASH: e_type      = 0x0003 ET_DYN
+// Make sure there are no section headers
+// HASH: e_shnum = 0x00000000
+// Make sure we were able to load the symbols
+// HASH: Symtab, file = {{.*}}elf-dynsym.test.tmp.sysv, num_symbols = 2:
+// HASH-DAG: undefined
+// HASH-DAG: defined
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp
new file mode 100644
index 0000000000000..328d6d2e16d59
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp
@@ -0,0 +1,36 @@
+// Test lldb is able to compute the fully qualified names on templates with
+// -gsimple-template-names and -fdebug-types-section.
+
+// REQUIRES: lld
+
+// Test against logging to see if we print the fully qualified names correctly.
+// RUN: %clangxx --target=x86_64-pc-linux -g -gsimple-template-names %s -c -o %t1.o
+// RUN: ld.lld %t1.o -o %t1
+// RUN: %lldb %t1 -o "log enable dwarf comp" -o "target variable v3" -o exit | FileCheck %s --check-prefix=LOG
+
+// Test that we follow DW_AT_signature correctly. If not, lldb might confuse the types of v1 and v2.
+// RUN: %clangxx --target=x86_64-pc-linux -g -gsimple-template-names -fdebug-types-section %s -c -o %t2.o
+// RUN: ld.lld %t2.o -o %t2
+// RUN: %lldb %t2 -o "target variable v1 v2" -o exit | FileCheck %s --check-prefix=TYPE
+
+// LOG: unique name: t3<t2<int> >::t4
+
+// TYPE:      (t2<outer_struct1::t1<int> >) v1 = {}
+// TYPE-NEXT: (t2<outer_struct2::t1<int> >) v2 = {}
+
+struct outer_struct1 {
+  template <typename> struct t1 {};
+};
+
+struct outer_struct2 {
+  template <typename> struct t1 {};
+};
+
+template <typename> struct t2 {};
+t2<outer_struct1::t1<int>> v1;
+t2<outer_struct2::t1<int>> v2;
+
+template <typename> struct t3 {
+  struct t4 {};
+};
+t3<t2<int>>::t4 v3;
diff --git a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test
index 94f1c011ebd2a..0356960424328 100644
--- a/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test
+++ b/lldb/test/Shell/Unwind/windows-unaligned-x86_64.test
@@ -17,7 +17,7 @@ breakpoint set -n func
 # CHECK: Breakpoint 1: where = {{.*}}`{{(::)?}}func
 
 process launch
-# CHECK: stop reason = breakpoint 1.1
+# CHECK: stop reason = breakpoint 1
 
 thread backtrace
 # CHECK: frame #0: {{.*}}`{{(::)?}}func
diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp
index 622a547fa22f0..e7425b737a6da 100644
--- a/lldb/unittests/Host/MainLoopTest.cpp
+++ b/lldb/unittests/Host/MainLoopTest.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 #include "llvm/Testing/Support/Error.h"
 #include "gtest/gtest.h"
+#include <chrono>
 #include <future>
 #include <thread>
 
@@ -106,13 +107,9 @@ TEST_F(MainLoopTest, NoSpuriousReads) {
       error);
   ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded());
   // Terminate the loop after one second.
-  std::thread terminate_thread([&loop] {
-    std::this_thread::sleep_for(std::chrono::seconds(1));
-    loop.AddPendingCallback(
-        [](MainLoopBase &loop) { loop.RequestTermination(); });
-  });
+  loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); },
+                   std::chrono::seconds(1));
   ASSERT_THAT_ERROR(loop.Run().ToError(), llvm::Succeeded());
-  terminate_thread.join();
 
   // Make sure the callback was called only once.
   ASSERT_EQ(1u, callback_count);
@@ -223,6 +220,61 @@ TEST_F(MainLoopTest, ManyPendingCallbacks) {
   ASSERT_TRUE(loop.Run().Success());
 }
 
+TEST_F(MainLoopTest, CallbackWithTimeout) {
+  MainLoop loop;
+  loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); },
+                   std::chrono::seconds(2));
+  auto start = std::chrono::steady_clock::now();
+  ASSERT_THAT_ERROR(loop.Run().takeError(), llvm::Succeeded());
+  EXPECT_GE(std::chrono::steady_clock::now() - start, std::chrono::seconds(2));
+}
+
+TEST_F(MainLoopTest, TimedCallbacksRunInOrder) {
+  MainLoop loop;
+  auto start = std::chrono::steady_clock::now();
+  std::chrono::milliseconds epsilon(10);
+  std::vector<int> order;
+  auto add_cb = [&](int id) {
+    loop.AddCallback([&order, id](MainLoopBase &) { order.push_back(id); },
+                     start + id * epsilon);
+  };
+  add_cb(3);
+  add_cb(2);
+  add_cb(4);
+  add_cb(1);
+  loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); },
+                   start + 5 * epsilon);
+  ASSERT_THAT_ERROR(loop.Run().takeError(), llvm::Succeeded());
+  EXPECT_GE(std::chrono::steady_clock::now() - start, 5 * epsilon);
+  ASSERT_THAT(order, testing::ElementsAre(1, 2, 3, 4));
+}
+
+TEST_F(MainLoopTest, TimedCallbackShortensSleep) {
+  MainLoop loop;
+  auto start = std::chrono::steady_clock::now();
+  bool long_callback_called = false;
+  loop.AddCallback(
+      [&](MainLoopBase &loop) {
+        long_callback_called = true;
+        loop.RequestTermination();
+      },
+      std::chrono::seconds(30));
+  std::future<Status> async_run =
+      std::async(std::launch::async, &MainLoop::Run, std::ref(loop));
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  bool short_callback_called = false;
+  loop.AddCallback(
+      [&](MainLoopBase &loop) {
+        short_callback_called = true;
+        loop.RequestTermination();
+      },
+      std::chrono::seconds(1));
+  ASSERT_THAT_ERROR(async_run.get().takeError(), llvm::Succeeded());
+  EXPECT_LT(std::chrono::steady_clock::now() - start, std::chrono::seconds(10));
+  EXPECT_TRUE(short_callback_called);
+  EXPECT_FALSE(long_callback_called);
+}
+
 #ifdef LLVM_ON_UNIX
 TEST_F(MainLoopTest, DetectsEOF) {
 
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
index 1e4c8f3ba0778..ae63e286cc155 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
@@ -14,6 +14,7 @@
 #include "lldb/Symbol/Type.h"
 #include "lldb/lldb-private-enumerations.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypePrinter.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
@@ -394,3 +395,127 @@ TEST(DWARFDIETest, GetContextInFunction) {
   EXPECT_THAT(foo_struct_die.GetTypeLookupContext(),
               testing::ElementsAre(make_struct("struct_t")));
 }
+
+TEST(DWARFDIETest, TestDWARFTypePrinter) {
+  // Make sure we can get template parameters and qualified names correctly with
+  // DWARFTypePrinter when using -gsimple-template-names.
+
+  // 0x0000000b: DW_TAG_compile_unit
+  // 0x0000000c:   DW_TAG_base_type
+  //                 DW_AT_name      ("int")
+  // 0x00000011:   DW_TAG_structure_type
+  //                 DW_AT_name      ("t1")
+  // 0x00000015:     DW_TAG_template_type_parameter
+  //                   DW_AT_type    (0x0000001f "t3<int>")
+  // 0x0000001a:     DW_TAG_structure_type
+  //                   DW_AT_name    ("t2")
+  // 0x0000001e:     NULL
+  // 0x0000001f:   DW_TAG_structure_type
+  //                 DW_AT_name      ("t3")
+  // 0x00000023:     DW_TAG_template_type_parameter
+  //                   DW_AT_type    (0x0000000c "int")
+  // 0x00000028:     NULL
+  // 0x00000029:   NULL
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+DWARF:
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+        - Code:            0x2
+          Tag:             DW_TAG_base_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x3
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x4
+          Tag:             DW_TAG_template_type_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_type
+              Form:            DW_FORM_ref4
+        - Code:            0x5
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x6
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x7
+          Tag:             DW_TAG_template_type_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_type
+              Form:            DW_FORM_ref4
+  debug_info:
+    - Version:         4
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0xDEADBEEFDEADBEEF
+              CStr:            int
+        - AbbrCode:        0x3
+          Values:
+            - Value:           0xDEADBEEFDEADBEEF
+              CStr:            t1
+        - AbbrCode:        0x4
+          Values:
+            - Value:            0x0000001f # update
+        - AbbrCode:        0x5
+          Values:
+            - Value:           0xDEADBEEFDEADBEEF
+              CStr:            t2
+        - AbbrCode:        0x0
+        - AbbrCode:        0x6
+          Values:
+            - Value:           0xDEADBEEFDEADBEEF
+              CStr:            t3
+        - AbbrCode:        0x7
+          Values:
+            - Value:            0x0000000c # update
+        - AbbrCode:        0x0
+        - AbbrCode:        0x0)";
+  YAMLModuleTester t(yamldata);
+  auto *symbol_file =
+      llvm::cast<SymbolFileDWARF>(t.GetModule()->GetSymbolFile());
+  DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0);
+  // Verify the unit exists before dereferencing it in Dump() below.
+  ASSERT_TRUE(unit);
+  StreamString debug_os;
+  unit->Dump(&debug_os);
+
+  DWARFDIE t1_die = unit->GetDIE(0x11);
+  std::string template_name;
+  llvm::raw_string_ostream template_name_os(template_name);
+  llvm::DWARFTypePrinter<DWARFDIE> template_name_printer(template_name_os);
+  template_name_printer.appendAndTerminateTemplateParameters(t1_die);
+  EXPECT_THAT(template_name, "<t3<int> >");
+
+  DWARFDIE t2_die = unit->GetDIE(0x1a);
+  std::string qualified_name;
+  llvm::raw_string_ostream qualified_name_os(qualified_name);
+  llvm::DWARFTypePrinter<DWARFDIE> qualified_name_printer(qualified_name_os);
+  qualified_name_printer.appendQualifiedName(t2_die);
+  EXPECT_THAT(qualified_name, "t1<t3<int> >::t2");
+}
diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
index ad2ebf7ffe1e2..4e5e0bb7dc355 100644
--- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
+++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp
@@ -29,15 +29,22 @@ TEST_F(ErrorDisplayTest, RenderStatus) {
   {
     // Test that diagnostics on the same column can be handled and all
     // three errors are diagnosed.
-    SourceLocation loc1 = {FileSpec{"a.c"}, 13, 11, 0, false, true};
-    SourceLocation loc2 = {FileSpec{"a.c"}, 13, 13, 0, false, true};
+    SourceLocation loc1 = {FileSpec{"a.c"}, 13, 5, 0, false, true};
+    SourceLocation loc2 = {FileSpec{"a.c"}, 13, 7, 0, false, true};
+    SourceLocation loc3 = {FileSpec{"a.c"}, 13, 9, 0, false, true};
     std::string result =
         Render({DiagnosticDetail{loc1, eSeverityError, "1", "1"},
-                DiagnosticDetail{loc1, eSeverityError, "2", "2"},
-                DiagnosticDetail{loc2, eSeverityError, "3", "3"}});
-    ASSERT_TRUE(StringRef(result).contains("error: 1"));
-    ASSERT_TRUE(StringRef(result).contains("error: 2"));
-    ASSERT_TRUE(StringRef(result).contains("error: 3"));
+                DiagnosticDetail{loc2, eSeverityError, "2a", "2a"},
+                DiagnosticDetail{loc2, eSeverityInfo, "2b", "2b"},
+                DiagnosticDetail{loc3, eSeverityError, "3", "3"}});
+    llvm::SmallVector<StringRef> lines;
+    StringRef(result).split(lines, '\n');
+    //                1234567890123
+    ASSERT_EQ(lines[0], "    ^ ^ ^");
+    ASSERT_EQ(lines[1], "    | | error: 3");
+    ASSERT_EQ(lines[2], "    | error: 2a");
+    ASSERT_EQ(lines[3], "    | note: 2b");
+    ASSERT_EQ(lines[4], "    error: 1");
   }
   {
     // Test that diagnostics in reverse order are emitted correctly.
@@ -68,15 +75,25 @@ TEST_F(ErrorDisplayTest, RenderStatus) {
     std::string result =
         Render({DiagnosticDetail{loc1, eSeverityError, "X", "X"},
                 DiagnosticDetail{loc2, eSeverityError, "Y", "Y"}});
-    auto lines = StringRef(result).split('\n');
-    auto line1 = lines.first;
-    lines = lines.second.split('\n');
-    auto line2 = lines.first;
-    lines = lines.second.split('\n');
-    auto line3 = lines.first;
-    //               1234567
-    ASSERT_EQ(line1, "^~~ ^~~");
-    ASSERT_EQ(line2, "|   error: Y");
-    ASSERT_EQ(line3, "error: X");
+    llvm::SmallVector<StringRef> lines;
+    StringRef(result).split(lines, '\n');
+    //                1234567
+    ASSERT_EQ(lines[0], "^~~ ^~~");
+    ASSERT_EQ(lines[1], "|   error: Y");
+    ASSERT_EQ(lines[2], "error: X");
+  }
+  {
+    // Test that diagnostics on the same line are emitted correctly.
+    SourceLocation loc1 = {FileSpec{"a.c"}, 1, 2, 0, false, true};
+    SourceLocation loc2 = {FileSpec{"a.c"}, 1, 6, 0, false, true};
+    std::string result =
+        Render({DiagnosticDetail{loc1, eSeverityError, "X", "X"},
+                DiagnosticDetail{loc2, eSeverityError, "Y", "Y"}});
+    llvm::SmallVector<StringRef> lines;
+    StringRef(result).split(lines, '\n');
+    //                1234567
+    ASSERT_EQ(lines[0], " ^   ^");
+    ASSERT_EQ(lines[1], " |   error: Y");
+    ASSERT_EQ(lines[2], " error: X");
   }
 }
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 74b72c9825341..cfcf1404d82b7 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT PACKAGE_VERSION)
 endif()
 
 if(NOT DEFINED LLVM_SHLIB_SYMBOL_VERSION)
-  # "Symbol version prefix for libLLVM.so"
+  # "Symbol version prefix for libLLVM.so and libclang-cpp.so"
   set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
 endif()
 
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index 8ad070bfd433c..695728a0d5817 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -121,8 +121,10 @@ evan.cheng@apple.com (email)
 
 #### SelectionDAG
 
-Justin Bogner \
-mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub)
+Simon Pilgrim \
+llvm-dev@redking.me.uk (email), [RKSimon](https://github.com/RKSimon) (GitHub) \
+Craig Topper \
+craig.topper@sifive.com (email), [topperc](https://github.com/topperc) (GitHub)
 
 #### FastISel
 
@@ -199,7 +201,7 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub)
 #### Hexagon backend
 
 Sundeep Kushwaha \
-sundeepk@quicinc.com (email)
+sundeepk@quicinc.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub)
 
 #### Lanai backend
 
@@ -416,17 +418,17 @@ Others only have a lead maintainer listed here.
 
 [Bolt maintainers](https://github.com/llvm/llvm-project/blob/main/bolt/CODE_OWNERS.TXT)
 
-[Clang maintainers](https://github.com/llvm/llvm-project/blob/main/clang/CodeOwners.rst)
+[Clang maintainers](https://github.com/llvm/llvm-project/blob/main/clang/Maintainers.rst)
 
 [Clang-tools-extra maintainers](https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/CODE_OWNERS.TXT)
 
 [Compiler-rt maintainers](https://github.com/llvm/llvm-project/blob/main/compiler-rt/CODE_OWNERS.TXT)
 
-[Flang maintainers](https://github.com/llvm/llvm-project/blob/main/flang/CODE_OWNERS.TXT)
+[Flang maintainers](https://github.com/llvm/llvm-project/blob/main/flang/Maintainers.txt)
 
 [LLD maintainers](https://github.com/llvm/llvm-project/blob/main/lld/CODE_OWNERS.TXT)
 
-[LLDB maintainers](https://github.com/llvm/llvm-project/blob/main/lldb/CodeOwners.rst)
+[LLDB maintainers](https://github.com/llvm/llvm-project/blob/main/lldb/Maintainers.rst)
 
 #### libc++
 
@@ -461,7 +463,8 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn
 
 ### Inactive or former component maintainers
 
-Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management
+Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \
+Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management
 
 ### Former maintainers of removed components
 
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 86f2bac7d23e8..64878d28d9e1e 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -40,6 +40,23 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
           list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_FILE_OFFSET_BITS=64")
 endif()
 
+# Newer POSIX functions aren't available without the appropriate defines.
+# Usually those are set by the use of -std=gnuXX, but one can also use the
+# newer functions with -std=c(++)XX, i.e. without the GNU language extensions.
+# Keep this at the top to make sure we don't add _GNU_SOURCE dependent checks
+# before adding it.
+check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC)
+if(LLVM_USING_GLIBC)
+  add_compile_definitions(_GNU_SOURCE)
+  list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
+
+  # enable 64bit off_t on 32bit systems using glibc
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    add_compile_definitions(_FILE_OFFSET_BITS=64)
+    list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_FILE_OFFSET_BITS=64")
+  endif()
+endif()
+
 # include checks
 check_include_file(dlfcn.h HAVE_DLFCN_H)
 check_include_file(errno.h HAVE_ERRNO_H)
@@ -336,17 +353,6 @@ else()
       "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
 endif()
 
-check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC)
-if( LLVM_USING_GLIBC )
-  add_compile_definitions(_GNU_SOURCE)
-  list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
-# enable 64bit off_t on 32bit systems using glibc
-  if (CMAKE_SIZEOF_VOID_P EQUAL 4)
-    add_compile_definitions(_FILE_OFFSET_BITS=64)
-    list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_FILE_OFFSET_BITS=64")
-  endif()
-endif()
-
 # This check requires _GNU_SOURCE.
 if (NOT PURE_WINDOWS)
   if (LLVM_PTHREAD_LIB)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c180ca5fcebef..a25b6feddbedd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -399,6 +399,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                                                                         work-item
                                                                         IDs
 
+     ``gfx950``                  ``amdgcn``   dGPU  - sramecc         - Architected                   *TBA*
+                                                    - tgsplit           flat
+                                                    - xnack             scratch                       .. TODO::
+                                                    - kernarg preload - Packed
+                                                                        work-item                       Add product
+                                                                        IDs                             names.
+
      **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_
      -----------------------------------------------------------------------------------------------------------------------
      ``gfx1010``                 ``amdgcn``   dGPU  - cumode          - Absolute      - *rocm-amdhsa* - Radeon RX 5700
@@ -2178,7 +2185,7 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX942``           0x04c      ``gfx942``
      *reserved*                                 0x04d      Reserved.
      ``EF_AMDGPU_MACH_AMDGCN_GFX1201``          0x04e      ``gfx1201``
-     *reserved*                                 0x04f      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX950``           0x04f      ``gfx950``
      *reserved*                                 0x050      Reserved.
      ``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC``     0x051      ``gfx9-generic``
      ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC``  0x052      ``gfx10-1-generic``
@@ -5468,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
                                                        roundup(lds-size / (64 * 4))
                                                      GFX7-GFX11
                                                        roundup(lds-size / (128 * 4))
+                                                     GFX950
+                                                       roundup(lds-size / (320 * 4))
 
      24      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    Wavefront starts execution
                      _INVALID_OPERATION              with specified exceptions
diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst
index c50e100443cf3..6614d036a014e 100644
--- a/llvm/docs/DeveloperPolicy.rst
+++ b/llvm/docs/DeveloperPolicy.rst
@@ -533,13 +533,13 @@ Obtaining Commit Access
 -----------------------
 
 We grant commit access to contributors that can provide a valid justification.
-If you would like commit access, please send an email to
-`Chris <mailto:clattner@llvm.org>`_ with your GitHub username.  This is true
-for former contributors with SVN access as well as new contributors. If
-approved, a GitHub invitation will be sent to your GitHub account. In case you
-don't get notification from GitHub, go to
+If you would like commit access, please use this `link <https://github.com/llvm/llvm-project/issues/new?title=Request%20Commit%20Access%20For%20%3Cuser%3E&body=%23%23%23%20Why%20Are%20you%20requesting%20commit%20access%20?>`_ to file
+an issue and request commit access. Replace the <user> string in the title
+with your GitHub username, and explain why you are requesting commit access in
+the issue description. If approved, a GitHub invitation will be sent to your
+GitHub account. In case you don't get a notification from GitHub, go to
 `Invitation Link <https://github.com/orgs/llvm/invitation>`_ directly. Once
-accept the invitation, you'll get commit access.
+you accept the invitation, you'll get commit access.
 
 Prior to obtaining commit access, it is common practice to request that
 someone with commit access commits on your behalf. When doing so, please
diff --git a/llvm/docs/HowToAddABuilder.rst b/llvm/docs/HowToAddABuilder.rst
index 9a0f2712f31f2..3984ef692b6ca 100644
--- a/llvm/docs/HowToAddABuilder.rst
+++ b/llvm/docs/HowToAddABuilder.rst
@@ -6,7 +6,7 @@ Introduction
 ============
 
 This document contains information about adding a build configuration and
-buildbot-worker to private worker builder to LLVM Buildbot Infrastructure.
+buildbot worker to the LLVM Buildbot Infrastructure.
 
 .. note:: The term "buildmaster" is used in this document to refer to the
   server that manages which builds are run and where. Though we would not
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 03c758c4e7f4b..d0b34c5958e02 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -190,6 +190,7 @@ Changes to the RISC-V Backend
 * The `Zvbc32e` and `Zvkgs` extensions are now supported experimentally.
 * Added `Smctr`, `Ssctr` and `Svvptc` extensions.
 * `-mcpu=syntacore-scr7` was added.
+* `-mcpu=tt-ascalon-d8` was added.
 * The `Zacas` extension is no longer marked as experimental.
 * Added Smdbltrp, Ssdbltrp extensions to -march.
 * The `Smmpm`, `Smnpm`, `Ssnpm`, `Supm`, and `Sspm` pointer masking extensions
@@ -197,6 +198,20 @@ Changes to the RISC-V Backend
 * The `Sha` extension is now supported.
 * The RVA23U64, RVA23S64, RVB23U64, and RVB23S64 profiles are no longer marked
   as experimental.
+* `.insn <length>, <raw encoding>` can be used to assemble 48- and 64-bit
+  instructions from raw integer values.
+* `.insn [<length>,] <raw encoding>` now accepts absolute expressions for both
+  operands, so that they can be computed from constants and absolute symbols.
+* The following new inline assembly constraints and modifiers are accepted:
+  * `cr` constraint meaning an RVC-encoding compatible GPR (`x8`-`x15`)
+  * `cf` constraint meaning an RVC-encoding compatible FPR (`f8`-`f15`)
+  * `R` constraint meaning an even-odd GPR pair (prints as the even register,
+    but both registers in the pair are considered live).
+  * `N` modifier meaning print the register encoding (0-31) rather than the name.
+* `f` and `cf` inline assembly constraints, when using F-/D-/H-in-X extensions,
+  will use the relevant GPR rather than FPR. This makes inline assembly portable
+  between e.g. F and Zfinx code.
+
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -311,6 +326,28 @@ Changes to the LLVM tools
 Changes to LLDB
 ---------------------------------
 
+* LLDB now supports inline diagnostics for the expression evaluator and command line parser.
+
+  Old:
+  ```
+  (lldb) p a+b
+  error: <user expression 0>:1:1: use of undeclared identifier 'a'
+      1 | a+b
+        | ^
+  error: <user expression 0>:1:3: use of undeclared identifier 'b'
+      1 | a+b
+        |   ^
+  ```
+
+  New:
+
+  ```
+  (lldb) p a+b
+           ˄ ˄
+           │ ╰─ error: use of undeclared identifier 'b'
+           ╰─ error: use of undeclared identifier 'a'
+  ```
+
 * LLDB can now read the `fpmr` register from AArch64 Linux processes and core
   files.
 
@@ -319,6 +356,8 @@ Changes to LLDB
 
 * A new setting `target.launch-working-dir` can be used to set a persistent cwd that is used by default by `process launch` and `run`.
 
+* LLDB now parses shared libraries in parallel, resulting in an average 2x speedup when attaching (only available on Darwin platforms) and launching (available on all platforms).
+
 Changes to BOLT
 ---------------------------------
 
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 9c6084d2d9dee..60875577561dc 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -144,9 +144,9 @@ static_assert(sizeof(AliasResult) == 4,
 /// << operator for AliasResult.
 raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);
 
-/// Virtual base class for providers of capture information.
-struct CaptureInfo {
-  virtual ~CaptureInfo() = 0;
+/// Virtual base class for providers of capture analysis.
+struct CaptureAnalysis {
+  virtual ~CaptureAnalysis() = 0;
 
   /// Check whether Object is not captured before instruction I. If OrAt is
   /// true, captures by instruction I itself are also considered.
@@ -156,10 +156,10 @@ struct CaptureInfo {
                                    bool OrAt) = 0;
 };
 
-/// Context-free CaptureInfo provider, which computes and caches whether an
+/// Context-free CaptureAnalysis provider, which computes and caches whether an
 /// object is captured in the function at all, but does not distinguish whether
 /// it was captured before or after the context instruction.
-class SimpleCaptureInfo final : public CaptureInfo {
+class SimpleCaptureAnalysis final : public CaptureAnalysis {
   SmallDenseMap<const Value *, bool, 8> IsCapturedCache;
 
 public:
@@ -167,10 +167,10 @@ class SimpleCaptureInfo final : public CaptureInfo {
                            bool OrAt) override;
 };
 
-/// Context-sensitive CaptureInfo provider, which computes and caches the
+/// Context-sensitive CaptureAnalysis provider, which computes and caches the
 /// earliest common dominator closure of all captures. It provides a good
 /// approximation to a precise "captures before" analysis.
-class EarliestEscapeInfo final : public CaptureInfo {
+class EarliestEscapeAnalysis final : public CaptureAnalysis {
   DominatorTree &DT;
   const LoopInfo *LI;
 
@@ -185,7 +185,7 @@ class EarliestEscapeInfo final : public CaptureInfo {
   DenseMap<Instruction *, TinyPtrVector<const Value *>> Inst2Obj;
 
 public:
-  EarliestEscapeInfo(DominatorTree &DT, const LoopInfo *LI = nullptr)
+  EarliestEscapeAnalysis(DominatorTree &DT, const LoopInfo *LI = nullptr)
       : DT(DT), LI(LI) {}
 
   bool isNotCapturedBefore(const Value *Object, const Instruction *I,
@@ -265,7 +265,7 @@ class AAQueryInfo {
   using AliasCacheT = SmallDenseMap<LocPair, CacheEntry, 8>;
   AliasCacheT AliasCache;
 
-  CaptureInfo *CI;
+  CaptureAnalysis *CA;
 
   /// Query depth used to distinguish recursive queries.
   unsigned Depth = 0;
@@ -298,15 +298,15 @@ class AAQueryInfo {
   /// passes that lazily update the DT while performing AA queries.
   bool UseDominatorTree = true;
 
-  AAQueryInfo(AAResults &AAR, CaptureInfo *CI) : AAR(AAR), CI(CI) {}
+  AAQueryInfo(AAResults &AAR, CaptureAnalysis *CA) : AAR(AAR), CA(CA) {}
 };
 
-/// AAQueryInfo that uses SimpleCaptureInfo.
+/// AAQueryInfo that uses SimpleCaptureAnalysis.
 class SimpleAAQueryInfo : public AAQueryInfo {
-  SimpleCaptureInfo CI;
+  SimpleCaptureAnalysis CA;
 
 public:
-  SimpleAAQueryInfo(AAResults &AAR) : AAQueryInfo(AAR, &CI) {}
+  SimpleAAQueryInfo(AAResults &AAR) : AAQueryInfo(AAR, &CA) {}
 };
 
 class BatchAAResults;
@@ -630,11 +630,12 @@ class AAResults {
 class BatchAAResults {
   AAResults &AA;
   AAQueryInfo AAQI;
-  SimpleCaptureInfo SimpleCI;
+  SimpleCaptureAnalysis SimpleCA;
 
 public:
-  BatchAAResults(AAResults &AAR) : AA(AAR), AAQI(AAR, &SimpleCI) {}
-  BatchAAResults(AAResults &AAR, CaptureInfo *CI) : AA(AAR), AAQI(AAR, CI) {}
+  BatchAAResults(AAResults &AAR) : AA(AAR), AAQI(AAR, &SimpleCA) {}
+  BatchAAResults(AAResults &AAR, CaptureAnalysis *CA)
+      : AA(AAR), AAQI(AAR, CA) {}
 
   AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
     return AA.alias(LocA, LocB, AAQI);
diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h
index 700c3b0f18b8d..18fb7377ff667 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -247,37 +247,30 @@ class DefaultInlineAdvisor : public InlineAdvisor {
 ///
 /// namespace {
 ///
-/// InlineAdvisor *defaultAdvisorFactory(Module &M, FunctionAnalysisManager
-/// &FAM,
-///                                      InlineParams Params, InlineContext IC)
-///                                      {
+/// InlineAdvisor *defaultAdvisorFactory(Module &M,
+///                                      FunctionAnalysisManager &FAM,
+///                                      InlineParams Params,
+///                                      InlineContext IC) {
 ///   return new DefaultInlineAdvisor(M, FAM, Params, IC);
 /// }
 ///
-/// struct DefaultDynamicAdvisor : PassInfoMixin<DefaultDynamicAdvisor> {
-///   PreservedAnalyses run(Module &, ModuleAnalysisManager &MAM) {
-///     PluginInlineAdvisorAnalysis PA(defaultAdvisorFactory);
-///     MAM.registerPass([&] { return PA; });
-///     return PreservedAnalyses::all();
-///   }
-/// };
-///
 /// } // namespace
 ///
 /// extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo
 /// llvmGetPassPluginInfo() {
 ///   return {LLVM_PLUGIN_API_VERSION, "DynamicDefaultAdvisor",
-///   LLVM_VERSION_STRING,
+///           LLVM_VERSION_STRING,
 ///           [](PassBuilder &PB) {
-///             PB.registerPipelineStartEPCallback(
-///                 [](ModulePassManager &MPM, OptimizationLevel Level) {
-///                   MPM.addPass(DefaultDynamicAdvisor());
+///             PB.registerAnalysisRegistrationCallback(
+///                 [](ModuleAnalysisManager &MAM) {
+///                   PluginInlineAdvisorAnalysis PA(defaultAdvisorFactory);
+///                   MAM.registerPass([&] { return PA; });
 ///                 });
 ///           }};
 /// }
 ///
 /// A plugin must implement an AdvisorFactory and register it with a
-/// PluginInlineAdvisorAnlysis to the provided ModuleanAlysisManager.
+/// PluginInlineAdvisorAnalysis to the provided ModuleAnalysisManager.
 ///
 /// If such a plugin has been registered
 /// InlineAdvisorAnalysis::Result::tryCreate will return the dynamically loaded
@@ -287,7 +280,6 @@ class PluginInlineAdvisorAnalysis
     : public AnalysisInfoMixin<PluginInlineAdvisorAnalysis> {
 public:
   static AnalysisKey Key;
-  static bool HasBeenRegistered;
 
   typedef InlineAdvisor *(*AdvisorFactory)(Module &M,
                                            FunctionAnalysisManager &FAM,
@@ -295,7 +287,6 @@ class PluginInlineAdvisorAnalysis
                                            InlineContext IC);
 
   PluginInlineAdvisorAnalysis(AdvisorFactory Factory) : Factory(Factory) {
-    HasBeenRegistered = true;
     assert(Factory != nullptr &&
            "The plugin advisor factory should not be a null pointer.");
   }
diff --git a/llvm/include/llvm/Analysis/InlineOrder.h b/llvm/include/llvm/Analysis/InlineOrder.h
index 2fa2d6091303a..498cef314b5c3 100644
--- a/llvm/include/llvm/Analysis/InlineOrder.h
+++ b/llvm/include/llvm/Analysis/InlineOrder.h
@@ -59,7 +59,6 @@ class PluginInlineOrderAnalysis
                            ModuleAnalysisManager &MAM, Module &M);
 
   PluginInlineOrderAnalysis(InlineOrderFactory Factory) : Factory(Factory) {
-    HasBeenRegistered = true;
     assert(Factory != nullptr &&
            "The plugin inline order factory should not be a null pointer.");
   }
@@ -71,11 +70,7 @@ class PluginInlineOrderAnalysis
   Result run(Module &, ModuleAnalysisManager &) { return {Factory}; }
   Result getResult() { return {Factory}; }
 
-  static bool isRegistered() { return HasBeenRegistered; }
-  static void unregister() { HasBeenRegistered = false; }
-
 private:
-  static bool HasBeenRegistered;
   InlineOrderFactory Factory;
 };
 
diff --git a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
index c31e663498d5f..d6288f2aa061e 100644
--- a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -355,7 +355,7 @@ class MemoryDependenceResults {
   const TargetLibraryInfo &TLI;
   DominatorTree &DT;
   PredIteratorCache PredCache;
-  EarliestEscapeInfo EII;
+  EarliestEscapeAnalysis EEA;
 
   unsigned DefaultBlockScanLimit;
 
@@ -367,7 +367,7 @@ class MemoryDependenceResults {
   MemoryDependenceResults(AAResults &AA, AssumptionCache &AC,
                           const TargetLibraryInfo &TLI, DominatorTree &DT,
                           unsigned DefaultBlockScanLimit)
-      : AA(AA), AC(AC), TLI(TLI), DT(DT), EII(DT),
+      : AA(AA), AC(AC), TLI(TLI), DT(DT), EEA(DT),
         DefaultBlockScanLimit(DefaultBlockScanLimit) {}
 
   /// Handle invalidation in the new PM.
diff --git a/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/llvm/include/llvm/Analysis/MemorySSAUpdater.h
index d4da3ef1146db..055feceefb054 100644
--- a/llvm/include/llvm/Analysis/MemorySSAUpdater.h
+++ b/llvm/include/llvm/Analysis/MemorySSAUpdater.h
@@ -190,7 +190,8 @@ class MemorySSAUpdater {
   /// inaccessible and it *must* have removeMemoryAccess called on it.
   MemoryAccess *createMemoryAccessInBB(Instruction *I, MemoryAccess *Definition,
                                        const BasicBlock *BB,
-                                       MemorySSA::InsertionPlace Point);
+                                       MemorySSA::InsertionPlace Point,
+                                       bool CreationMustSucceed = true);
 
   /// Create a MemoryAccess in MemorySSA before an existing MemoryAccess.
   ///
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ccace59d6d36..e37bce3118bcb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -630,6 +630,10 @@ class TargetTransformInfo {
                                 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo) const;
 
+  // Query the target for the minimum vectorization factor at which epilogue
+  // vectorization should be considered.
+  unsigned getEpilogueVectorizationMinVF() const;
+
   /// Query the target whether it would be prefered to create a predicated
   /// vector loop, which can avoid the need to emit a scalar epilogue loop.
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) = 0;
+  virtual unsigned getEpilogueVectorizationMinVF() = 0;
   virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
   virtual TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                 HardwareLoopInfo &HWLoopInfo) override {
     return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
+  unsigned getEpilogueVectorizationMinVF() override {
+    return Impl.getEpilogueVectorizationMinVF();
+  }
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
     return Impl.preferPredicateOverEpilogue(TFI);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c3c5629d61c91..72038c090b792 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  unsigned getEpilogueVectorizationMinVF() const { return 16; }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
 
   TailFoldingStyle
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 6c05ea7208e1f..fd32a6ec19652 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -811,7 +811,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX942          = 0x04c,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D   = 0x04d,
   EF_AMDGPU_MACH_AMDGCN_GFX1201         = 0x04e,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F   = 0x04f,
+  EF_AMDGPU_MACH_AMDGCN_GFX950          = 0x04f,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50   = 0x050,
   EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC    = 0x051,
   EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def
index 161b1969abfeb..43473d47e3281 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def
@@ -43,4 +43,4 @@ ELF_RELOC(R_X86_64_TLSDESC,     36)
 ELF_RELOC(R_X86_64_IRELATIVE,   37)
 ELF_RELOC(R_X86_64_GOTPCRELX,   41)
 ELF_RELOC(R_X86_64_REX_GOTPCRELX,    42)
-ELF_RELOC(R_X86_64_REX2_GOTPCRELX,    43)
+ELF_RELOC(R_X86_64_CODE_4_GOTPCRELX,    43)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 36df9ee2e7d94..3b098c42f2741 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
 
+  unsigned getEpilogueVectorizationMinVF() {
+    return BaseT::getEpilogueVectorizationMinVF();
+  }
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
@@ -1574,6 +1578,67 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     if (Intrinsic::isTargetIntrinsic(IID))
       return TargetTransformInfo::TCC_Basic;
 
+    // VP Intrinsics should have the same cost as their non-vp counterpart.
+    // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
+    // counterpart when the vector length argument is smaller than the maximum
+    // vector length.
+    // TODO: Support other kinds of VPIntrinsics
+    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
+      std::optional<unsigned> FOp =
+          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+      if (FOp) {
+        if (ICA.getID() == Intrinsic::vp_load) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgTypes().size() > 1)
+            if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
+                                          AS, CostKind);
+        }
+        if (ICA.getID() == Intrinsic::vp_store) {
+          Align Alignment;
+          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+            Alignment = VPI->getPointerAlignment().valueOrOne();
+          unsigned AS = 0;
+          if (ICA.getArgTypes().size() >= 2)
+            if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
+              AS = PtrTy->getAddressSpace();
+          return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
+                                          AS, CostKind);
+        }
+        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
+          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
+                                                 CostKind);
+        }
+      }
+
+      std::optional<Intrinsic::ID> FID =
+          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
+      if (FID) {
+        // Non-vp version will have same arg types except mask and vector
+        // length.
+        assert(ICA.getArgTypes().size() >= 2 &&
+               "Expected VPIntrinsic to have Mask and Vector Length args and "
+               "types");
+        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
+
+        // VPReduction intrinsics have a start value argument that their non-vp
+        // counterparts do not have, except for the fadd and fmul non-vp
+        // counterpart.
+        if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
+            *FID != Intrinsic::vector_reduce_fadd &&
+            *FID != Intrinsic::vector_reduce_fmul)
+          NewTys = NewTys.drop_front();
+
+        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
+                                       ICA.getFlags());
+        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
+      }
+    }
+
     if (ICA.isTypeBasedOnly())
       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
 
@@ -1834,68 +1899,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
     }
 
-    // VP Intrinsics should have the same cost as their non-vp counterpart.
-    // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
-    // counterpart when the vector length argument is smaller than the maximum
-    // vector length.
-    // TODO: Support other kinds of VPIntrinsics
-    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
-      std::optional<unsigned> FOp =
-          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
-      if (FOp) {
-        if (ICA.getID() == Intrinsic::vp_load) {
-          Align Alignment;
-          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
-            Alignment = VPI->getPointerAlignment().valueOrOne();
-          unsigned AS = 0;
-          if (ICA.getArgs().size() > 1)
-            if (auto *PtrTy =
-                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
-              AS = PtrTy->getAddressSpace();
-          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
-                                          AS, CostKind);
-        }
-        if (ICA.getID() == Intrinsic::vp_store) {
-          Align Alignment;
-          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
-            Alignment = VPI->getPointerAlignment().valueOrOne();
-          unsigned AS = 0;
-          if (ICA.getArgs().size() >= 2)
-            if (auto *PtrTy =
-                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
-              AS = PtrTy->getAddressSpace();
-          return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
-                                          AS, CostKind);
-        }
-        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
-          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
-                                                 CostKind);
-        }
-      }
-
-      std::optional<Intrinsic::ID> FID =
-          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
-      if (FID) {
-        // Non-vp version will have same Args/Tys except mask and vector length.
-        assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
-               "Expected VPIntrinsic to have Mask and Vector Length args and "
-               "types");
-        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
-
-        // VPReduction intrinsics have a start value argument that their non-vp
-        // counterparts do not have, except for the fadd and fmul non-vp
-        // counterpart.
-        if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
-            *FID != Intrinsic::vector_reduce_fadd &&
-            *FID != Intrinsic::vector_reduce_fmul)
-          NewTys = NewTys.drop_front();
-
-        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
-                                       ICA.getFlags());
-        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
-      }
-    }
-
     // Assume that we need to scalarize this intrinsic.)
     // Compute the scalarization overhead based on Args for a vector
     // intrinsic.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b1232a368a365..55c3b72c8e027 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -939,6 +939,9 @@ class CombinerHelper {
   // merge_values(_, zero) -> zext
   bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // overflow sub
+  bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 4de14dee190fb..9e5d4d34f24d2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut {
   }
 };
 
+/// Represents overflowing sub operations.
+/// G_USUBO, G_SSUBO
+class GSubCarryOut : public GBinOpCarryOut {
+public:
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_USUBO:
+    case TargetOpcode::G_SSUBO:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents overflowing add/sub operations that also consume a carry-in.
 /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
 class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
index 69c91835a4d9a..2e98a4a397147 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -226,6 +226,8 @@ class DWARFDie {
 
   bool addressRangeContainsAddress(const uint64_t Address) const;
 
+  std::optional<uint64_t> getLanguage() const;
+
   Expected<DWARFLocationExpressionsVector>
   getLocations(dwarf::Attribute Attr) const;
 
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h
index 87e876273c4b9..962462b827825 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFTypePrinter.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/Error.h"
 
 #include <string>
 
@@ -107,13 +108,11 @@ void DWARFTypePrinter<DieType>::appendArrayType(const DieType &D) {
     if (std::optional<typename DieType::DWARFFormValue> UpperV =
             C.find(dwarf::DW_AT_upper_bound))
       UB = UpperV->getAsUnsignedConstant();
-    if (std::optional<typename DieType::DWARFFormValue> LV =
-            D.getDwarfUnit()->getUnitDIE().find(dwarf::DW_AT_language))
-      if (std::optional<uint64_t> LC = LV->getAsUnsignedConstant())
-        if ((DefaultLB =
-                 LanguageLowerBound(static_cast<dwarf::SourceLanguage>(*LC))))
-          if (LB && *LB == *DefaultLB)
-            LB = std::nullopt;
+    if (std::optional<uint64_t> LV = D.getLanguage())
+      if ((DefaultLB =
+               LanguageLowerBound(static_cast<dwarf::SourceLanguage>(*LV))))
+        if (LB && *LB == *DefaultLB)
+          LB = std::nullopt;
     if (!LB && !Count && !UB)
       OS << "[]";
     else if (!LB && (Count || UB) && DefaultLB)
@@ -150,6 +149,16 @@ template <typename DieType>
 DieType resolveReferencedType(DieType D, typename DieType::DWARFFormValue F) {
   return D.resolveReferencedType(F);
 }
+template <typename DWARFFormValueType>
+const char *toString(std::optional<DWARFFormValueType> F) {
+  if (F) {
+    llvm::Expected<const char *> E = F->getAsCString();
+    if (E)
+      return *E;
+    llvm::consumeError(E.takeError());
+  }
+  return nullptr;
+}
 } // namespace detail
 
 template <typename DieType>
@@ -239,7 +248,7 @@ DieType DWARFTypePrinter<DieType>::appendUnqualifiedNameBefore(
     appendConstVolatileQualifierBefore(D);
     break;
   case dwarf::DW_TAG_namespace: {
-    if (const char *Name = dwarf::toString(D.find(dwarf::DW_AT_name), nullptr))
+    if (const char *Name = detail::toString(D.find(dwarf::DW_AT_name)))
       OS << Name;
     else
       OS << "(anonymous namespace)";
@@ -261,7 +270,7 @@ DieType DWARFTypePrinter<DieType>::appendUnqualifiedNameBefore(
   case DW_TAG_base_type:
   */
   default: {
-    const char *NamePtr = dwarf::toString(D.find(dwarf::DW_AT_name), nullptr);
+    const char *NamePtr = detail::toString(D.find(dwarf::DW_AT_name));
     if (!NamePtr) {
       appendTypeTagName(D.getTag());
       return DieType();
@@ -440,7 +449,7 @@ bool DWARFTypePrinter<DieType>::appendTemplateParameters(DieType D,
       if (T.getTag() == dwarf::DW_TAG_pointer_type ||
           T.getTag() == dwarf::DW_TAG_reference_type)
         continue;
-      const char *RawName = dwarf::toString(T.find(dwarf::DW_AT_name), nullptr);
+      const char *RawName = detail::toString(T.find(dwarf::DW_AT_name));
       assert(RawName);
       StringRef Name = RawName;
       auto V = C.find(dwarf::DW_AT_const_value);
@@ -533,7 +542,7 @@ bool DWARFTypePrinter<DieType>::appendTemplateParameters(DieType D,
     }
     if (C.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
       const char *RawName =
-          dwarf::toString(C.find(dwarf::DW_AT_GNU_template_name), nullptr);
+          detail::toString(C.find(dwarf::DW_AT_GNU_template_name));
       assert(RawName);
       StringRef Name = RawName;
       Sep();
@@ -593,7 +602,7 @@ void DWARFTypePrinter<DieType>::appendConstVolatileQualifierAfter(DieType N) {
   decomposeConstVolatile(N, T, C, V);
   if (T && T.getTag() == dwarf::DW_TAG_subroutine_type)
     appendSubroutineNameAfter(T, detail::resolveReferencedType(T), false,
-                              C.isValid(), V.isValid());
+                              static_cast<bool>(C), static_cast<bool>(V));
   else
     appendUnqualifiedNameAfter(T, detail::resolveReferencedType(T));
 }
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
index b7b98d55cc65b..c9595ed4d99e7 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
@@ -20,6 +20,9 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include <cassert>
+#if __has_feature(ptrauth_calls)
+#include <ptrauth.h>
+#endif
 #include <type_traits>
 
 namespace llvm {
@@ -33,12 +36,40 @@ class ExecutorAddr {
   /// A wrap/unwrap function that leaves pointers unmodified.
   template <typename T> using rawPtr = llvm::identity<T *>;
 
+#if __has_feature(ptrauth_calls)
+  template <typename T> class PtrauthSignDefault {
+  public:
+    constexpr T *operator()(T *P) {
+      if (std::is_function_v<T>)
+        return ptrauth_sign_unauthenticated(P, ptrauth_key_function_pointer, 0);
+      else
+        return P;
+    }
+  };
+
+  template <typename T> class PtrauthStripDefault {
+  public:
+    constexpr T *operator()(T *P) {
+      return ptrauth_strip(P, ptrauth_key_function_pointer);
+    }
+  };
+
+  /// Default wrap function to use on this host.
+  template <typename T> using defaultWrap = PtrauthSignDefault<T>;
+
+  /// Default unwrap function to use on this host.
+  template <typename T> using defaultUnwrap = PtrauthStripDefault<T>;
+
+#else
+
   /// Default wrap function to use on this host.
   template <typename T> using defaultWrap = rawPtr<T>;
 
   /// Default unwrap function to use on this host.
   template <typename T> using defaultUnwrap = rawPtr<T>;
 
+#endif
+
   /// Merges a tag into the raw address value:
   ///   P' = P | (TagValue << TagOffset).
   class Tag {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h
index 0d79c071ecd30..dd771ac3b416f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h
@@ -47,6 +47,8 @@ static constexpr inline bool canHaveIterator(Clause C) {
   }
 }
 
+ArrayRef<unsigned> getOpenMPVersions();
+
 /// Create a nicer version of a function name for humans to look at.
 std::string prettifyFunctionName(StringRef FunctionName);
 
diff --git a/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/llvm/include/llvm/FuzzMutate/OpDescriptor.h
index 78114074dbbfc..4a3c2f767d00c 100644
--- a/llvm/include/llvm/FuzzMutate/OpDescriptor.h
+++ b/llvm/include/llvm/FuzzMutate/OpDescriptor.h
@@ -63,7 +63,7 @@ class SourcePred {
       // Default filter just calls Pred on each of the base types.
       std::vector<Constant *> Result;
       for (Type *T : BaseTypes) {
-        Constant *V = UndefValue::get(T);
+        Constant *V = PoisonValue::get(T);
         if (Pred(Cur, V))
           makeConstantsWithType(T, Result);
       }
@@ -155,7 +155,7 @@ static inline SourcePred anyPtrType() {
     std::vector<Constant *> Result;
     // TODO: Should these point at something?
     for (Type *T : Ts)
-      Result.push_back(UndefValue::get(PointerType::getUnqual(T)));
+      Result.push_back(PoisonValue::get(PointerType::getUnqual(T)));
     return Result;
   };
   return {Pred, Make};
@@ -175,7 +175,7 @@ static inline SourcePred sizedPtrType() {
     // as the pointer type will always be the same.
     for (Type *T : Ts)
       if (T->isSized())
-        Result.push_back(UndefValue::get(PointerType::getUnqual(T)));
+        Result.push_back(PoisonValue::get(PointerType::getUnqual(T)));
 
     return Result;
   };
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 19d351456d658..65f9810776024 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -225,7 +225,9 @@ class StructType : public Type {
     SCDB_IsLiteral = 4,
     SCDB_IsSized = 8,
     SCDB_ContainsScalableVector = 16,
-    SCDB_NotContainsScalableVector = 32
+    SCDB_NotContainsScalableVector = 32,
+    SCDB_ContainsNonGlobalTargetExtType = 64,
+    SCDB_NotContainsNonGlobalTargetExtType = 128,
   };
 
   /// For a named struct that actually has a name, this is a pointer to the
@@ -294,6 +296,12 @@ class StructType : public Type {
   bool isScalableTy(SmallPtrSetImpl<const Type *> &Visited) const;
   using Type::isScalableTy;
 
+  /// Return true if this type is or contains a target extension type that
+  /// disallows being used as a global.
+  bool
+  containsNonGlobalTargetExtType(SmallPtrSetImpl<const Type *> &Visited) const;
+  using Type::containsNonGlobalTargetExtType;
+
   /// Returns true if this struct contains homogeneous scalable vector types.
   /// Note that the definition of homogeneous scalable vector type is not
   /// recursive here. That means the following structure will return false
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 1c60eae7f2f85..e6332a16df7d5 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -935,43 +935,6 @@ class CmpInst : public Instruction {
     return isUnsigned(getPredicate());
   }
 
-  /// For example, ULT->SLT, ULE->SLE, UGT->SGT, UGE->SGE, SLT->Failed assert
-  /// @returns the signed version of the unsigned predicate pred.
-  /// return the signed version of a predicate
-  static Predicate getSignedPredicate(Predicate pred);
-
-  /// For example, ULT->SLT, ULE->SLE, UGT->SGT, UGE->SGE, SLT->Failed assert
-  /// @returns the signed version of the predicate for this instruction (which
-  /// has to be an unsigned predicate).
-  /// return the signed version of a predicate
-  Predicate getSignedPredicate() {
-    return getSignedPredicate(getPredicate());
-  }
-
-  /// For example, SLT->ULT, SLE->ULE, SGT->UGT, SGE->UGE, ULT->Failed assert
-  /// @returns the unsigned version of the signed predicate pred.
-  static Predicate getUnsignedPredicate(Predicate pred);
-
-  /// For example, SLT->ULT, SLE->ULE, SGT->UGT, SGE->UGE, ULT->Failed assert
-  /// @returns the unsigned version of the predicate for this instruction (which
-  /// has to be an signed predicate).
-  /// return the unsigned version of a predicate
-  Predicate getUnsignedPredicate() {
-    return getUnsignedPredicate(getPredicate());
-  }
-
-  /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert
-  /// @returns the unsigned version of the signed predicate pred or
-  ///          the signed version of the signed predicate pred.
-  static Predicate getFlippedSignednessPredicate(Predicate pred);
-
-  /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert
-  /// @returns the unsigned version of the signed predicate pred or
-  ///          the signed version of the signed predicate pred.
-  Predicate getFlippedSignednessPredicate() {
-    return getFlippedSignednessPredicate(getPredicate());
-  }
-
   /// This is just a convenience.
   /// Determine if this is true when both operands are the same.
   bool isTrueWhenEqual() const {
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 8eea659a00caf..605964af5d676 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -1206,27 +1206,37 @@ class ICmpInst: public CmpInst {
   /// For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
   /// @returns the predicate that would be the result if the operand were
   /// regarded as signed.
-  /// Return the signed version of the predicate
+  /// Return the signed version of the predicate.
   Predicate getSignedPredicate() const {
     return getSignedPredicate(getPredicate());
   }
 
-  /// This is a static version that you can use without an instruction.
-  /// Return the signed version of the predicate.
+  /// Return the signed version of the predicate: static variant.
   static Predicate getSignedPredicate(Predicate pred);
 
   /// For example, EQ->EQ, SLE->ULE, UGT->UGT, etc.
   /// @returns the predicate that would be the result if the operand were
   /// regarded as unsigned.
-  /// Return the unsigned version of the predicate
+  /// Return the unsigned version of the predicate.
   Predicate getUnsignedPredicate() const {
     return getUnsignedPredicate(getPredicate());
   }
 
-  /// This is a static version that you can use without an instruction.
-  /// Return the unsigned version of the predicate.
+  /// Return the unsigned version of the predicate: static variant.
   static Predicate getUnsignedPredicate(Predicate pred);
 
+  /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert
+  /// @returns the unsigned version of the signed predicate pred or
+  ///          the signed version of the unsigned predicate pred.
+  static Predicate getFlippedSignednessPredicate(Predicate pred);
+
+  /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert
+  /// @returns the unsigned version of the signed predicate pred or
+  ///          the signed version of the unsigned predicate pred.
+  Predicate getFlippedSignednessPredicate() const {
+    return getFlippedSignednessPredicate(getPredicate());
+  }
+
   void setSameSign(bool B = true) {
     SubclassOptionalData = (SubclassOptionalData & ~SameSign) | (B * SameSign);
   }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4829453ee57cd..360af786c5160 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -594,6 +594,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
 
+def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
+  [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
+>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;
+
 } // TargetPrefix = "amdgcn"
 
 // New-style image intrinsics
@@ -1670,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty,             // rsrc(SGPR)
    LLVMQualPointerType<3>,    // LDS base offset
-   llvm_i32_ty,               // Data byte size: 1/2/4
+   llvm_i32_ty,               // Data byte size: 1/2/4 (/12/16 for gfx950)
    llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
    llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
@@ -1689,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,        // rsrc(SGPR)
    LLVMQualPointerType<3>,    // LDS base offset
-   llvm_i32_ty,               // Data byte size: 1/2/4
+   llvm_i32_ty,               // Data byte size: 1/2/4 (/12/16 for gfx950)
    llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
    llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,               // imm offset(imm, included in bounds checking and swizzling)
@@ -1711,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
   [],
   [llvm_v4i32_ty,             // rsrc(SGPR)
    LLVMQualPointerType<3>,    // LDS base offset
-   llvm_i32_ty,               // Data byte size: 1/2/4
+   llvm_i32_ty,               // Data byte size: 1/2/4 (/12/16 for gfx950)
    llvm_i32_ty,               // vindex(VGPR)
    llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
    llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1731,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
   [],
   [AMDGPUBufferRsrcTy,        // rsrc(SGPR)
    LLVMQualPointerType<3>,    // LDS base offset
-   llvm_i32_ty,               // Data byte size: 1/2/4
+   llvm_i32_ty,               // Data byte size: 1/2/4 (/12/16 for gfx950)
    llvm_i32_ty,               // vindex(VGPR)
    llvm_i32_ty,               // voffset(VGPR, included in bounds checking and swizzling)
    llvm_i32_ty,               // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -2448,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
     [],
     [LLVMQualPointerType<1>,            // Base global pointer to load from
      LLVMQualPointerType<3>,            // LDS base pointer to store to
-     llvm_i32_ty,                       // Data byte size: 1/2/4
+     llvm_i32_ty,                       // Data byte size: 1/2/4 (/12/16 for gfx950)
      llvm_i32_ty,                       // imm offset (applied to both global and LDS address)
      llvm_i32_ty],                      // auxiliary data (imm, cachepolicy (bit 0 = sc0,
                                         //                                   bit 1 = sc1,
@@ -3106,6 +3110,17 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
             [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
             [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
+//===----------------------------------------------------------------------===//
+// gfx950 intrinsics
+//===----------------------------------------------------------------------===//
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
+def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
+def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+
+def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
+}
+
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 50def0eaf7886..39c60229aa1d8 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1463,7 +1463,7 @@ class ModuleSummaryIndex {
   // in the way some record are interpreted, like flags for instance.
   // Note that incrementing this may require changes in both BitcodeReader.cpp
   // and BitcodeWriter.cpp.
-  static constexpr uint64_t BitcodeSummaryVersion = 11;
+  static constexpr uint64_t BitcodeSummaryVersion = 12;
 
   // Regular LTO module name for ASM writer
   static constexpr const char *getRegularLTOModuleName() {
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index d269221fac070..5dab9d0d0a797 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -398,6 +398,11 @@ template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
     AnalysisResultLists.clear();
   }
 
+  /// Returns true if the specified analysis pass is registered.
+  template <typename PassT> bool isPassRegistered() const {
+    return AnalysisPasses.count(PassT::ID());
+  }
+
   /// Get the result of an analysis pass for a given IR unit.
   ///
   /// Runs the analysis if a cached result is not available.
@@ -458,10 +463,9 @@ template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
   /// and this function returns true.
   ///
   /// (Note: Although the return value of this function indicates whether or not
-  /// an analysis was previously registered, there intentionally isn't a way to
-  /// query this directly.  Instead, you should just register all the analyses
-  /// you might want and let this class run them lazily.  This idiom lets us
-  /// minimize the number of times we have to look up analyses in our
+  /// an analysis was previously registered, you should just register all the
+  /// analyses you might want and let this class run them lazily.  This idiom
+  /// lets us minimize the number of times we have to look up analyses in our
   /// hashtable.)
   template <typename PassBuilderT>
   bool registerPass(PassBuilderT &&PassBuilder) {
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 7fa940ab347af..000fdee45bb86 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -209,6 +209,12 @@ class Type {
   bool isScalableTy(SmallPtrSetImpl<const Type *> &Visited) const;
   bool isScalableTy() const;
 
+  /// Return true if this type is or contains a target extension type that
+  /// disallows being used as a global.
+  bool
+  containsNonGlobalTargetExtType(SmallPtrSetImpl<const Type *> &Visited) const;
+  bool containsNonGlobalTargetExtType() const;
+
   /// Return true if this is a FP type or a vector of FP.
   bool isFPOrFPVectorTy() const { return getScalarType()->isFloatingPointTy(); }
 
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 199e565bead04..fa30926c66258 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -130,6 +130,10 @@ class InstrProfWriter {
                            const llvm::SmallVector<memprof::FrameId> &CallStack,
                            function_ref<void(Error)> Warn);
 
+  /// Add the entire MemProfData \p Incoming to the writer context.
+  bool addMemProfData(memprof::IndexedMemProfData Incoming,
+                      function_ref<void(Error)> Warn);
+
   // Add a binary id to the binary ids list.
   void addBinaryIds(ArrayRef<llvm::object::BuildID> BIs);
 
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 6a29e3df9629b..f97fbd4bd6441 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -354,10 +354,15 @@ struct IndexedAllocationInfo {
   PortableMemInfoBlock Info;
 
   IndexedAllocationInfo() = default;
+  // This constructor is soft deprecated.  It will be removed once we remove all
+  // users of the CallStack field.
   IndexedAllocationInfo(ArrayRef<FrameId> CS, CallStackId CSId,
                         const MemInfoBlock &MB,
                         const MemProfSchema &Schema = getFullSchema())
       : CallStack(CS), CSId(CSId), Info(MB, Schema) {}
+  IndexedAllocationInfo(CallStackId CSId, const MemInfoBlock &MB,
+                        const MemProfSchema &Schema = getFullSchema())
+      : CSId(CSId), Info(MB, Schema) {}
 
   // Returns the size in bytes when this allocation info struct is serialized.
   size_t serializedSize(const MemProfSchema &Schema,
@@ -1045,8 +1050,9 @@ struct FrameStat {
 };
 
 // Compute a histogram of Frames in call stacks.
-llvm::DenseMap<FrameId, FrameStat>
-computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+template <typename FrameIdTy>
+llvm::DenseMap<FrameIdTy, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                           &MemProfCallStackData);
 
 // Construct a radix tree of call stacks.
@@ -1104,7 +1110,7 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
 // On-disk IndexedMemProfRecord will refer to call stacks by their indexes into
 // the radix tree array, so we do not explicitly encode mappings like:
 // "CallStackId 1 -> 11".
-class CallStackRadixTreeBuilder {
+template <typename FrameIdTy> class CallStackRadixTreeBuilder {
   // The radix tree array.
   std::vector<LinearFrameId> RadixArray;
 
@@ -1131,23 +1137,25 @@ class CallStackRadixTreeBuilder {
   // RadixArray[Indexes[5 - 1]] is the last frame of the common prefix.
   std::vector<LinearCallStackId> Indexes;
 
-  using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameId>>;
+  using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameIdTy>>;
 
   // Encode a call stack into RadixArray.  Return the starting index within
   // RadixArray.
-  LinearCallStackId encodeCallStack(
-      const llvm::SmallVector<FrameId> *CallStack,
-      const llvm::SmallVector<FrameId> *Prev,
-      const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
+  LinearCallStackId
+  encodeCallStack(const llvm::SmallVector<FrameIdTy> *CallStack,
+                  const llvm::SmallVector<FrameIdTy> *Prev,
+                  std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+                      MemProfFrameIndexes);
 
 public:
   CallStackRadixTreeBuilder() = default;
 
   // Build a radix tree array.
-  void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+  void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                  &&MemProfCallStackData,
-             const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
-             llvm::DenseMap<FrameId, FrameStat> &FrameHistogram);
+             std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+                 MemProfFrameIndexes,
+             llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram);
 
   ArrayRef<LinearFrameId> getRadixArray() const { return RadixArray; }
 
diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index da2f14b276ffb..57ddcbf350060 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -47,22 +47,43 @@ class MemProfReader {
   }
 
   // Return a const reference to the internal Id to Frame mappings.
+  LLVM_DEPRECATED("Use takeMemProfData instead", "takeMemProfData")
   const llvm::DenseMap<FrameId, Frame> &getFrameMapping() const {
     return IdToFrame;
   }
 
   // Return a const reference to the internal Id to call stacks.
+  LLVM_DEPRECATED("Use takeMemProfData instead", "takeMemProfData")
   const llvm::DenseMap<CallStackId, llvm::SmallVector<FrameId>> &
   getCallStacks() const {
     return CSIdToCallStack;
   }
 
   // Return a const reference to the internal function profile data.
+  LLVM_DEPRECATED("Use takeMemProfData instead", "takeMemProfData")
   const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord> &
   getProfileData() const {
     return FunctionProfileData;
   }
 
+  // Take the complete profile data.
+  IndexedMemProfData takeMemProfData() {
+    // TODO: Once we replace the three member variables, namely IdToFrame,
+    // CSIdToCallStack, and FunctionProfileData, with MemProfData, replace the
+    // following code with just "return std::move(MemProfData);".
+    IndexedMemProfData MemProfData;
+    // Copy key-value pairs because IdToFrame uses DenseMap, whereas
+    // IndexedMemProfData::Frames uses MapVector.
+    for (const auto &[FrameId, F] : IdToFrame)
+      MemProfData.Frames.try_emplace(FrameId, F);
+    // Copy key-value pairs because CSIdToCallStack uses DenseMap, whereas
+    // IndexedMemProfData::CallStacks uses MapVector.
+    for (const auto &[CSId, CS] : CSIdToCallStack)
+      MemProfData.CallStacks.try_emplace(CSId, CS);
+    MemProfData.Records = FunctionProfileData;
+    return MemProfData;
+  }
+
   virtual Error
   readNextRecord(GuidMemProfRecordPair &GuidRecord,
                  std::function<const Frame(const FrameId)> Callback = nullptr) {
@@ -96,12 +117,16 @@ class MemProfReader {
   virtual ~MemProfReader() = default;
 
   // Initialize the MemProfReader with the frame mappings and profile contents.
+  LLVM_DEPRECATED("Construct MemProfReader with IndexedMemProfData",
+                  "MemProfReader")
   MemProfReader(
       llvm::DenseMap<FrameId, Frame> FrameIdMap,
       llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord> ProfData);
 
   // Initialize the MemProfReader with the frame mappings, call stack mappings,
   // and profile contents.
+  LLVM_DEPRECATED("Construct MemProfReader with IndexedMemProfData",
+                  "MemProfReader")
   MemProfReader(
       llvm::DenseMap<FrameId, Frame> FrameIdMap,
       llvm::DenseMap<CallStackId, llvm::SmallVector<FrameId>> CSIdMap,
@@ -109,6 +134,15 @@ class MemProfReader {
       : IdToFrame(std::move(FrameIdMap)), CSIdToCallStack(std::move(CSIdMap)),
         FunctionProfileData(std::move(ProfData)) {}
 
+  // Initialize the MemProfReader with the given MemProf profile.
+  MemProfReader(IndexedMemProfData MemProfData) {
+    for (const auto &[FrameId, F] : MemProfData.Frames)
+      IdToFrame.try_emplace(FrameId, F);
+    for (const auto &[CSId, CS] : MemProfData.CallStacks)
+      CSIdToCallStack.try_emplace(CSId, CS);
+    FunctionProfileData = std::move(MemProfData.Records);
+  }
+
 protected:
   // A helper method to extract the frame from the IdToFrame map.
   const Frame &idToFrame(const FrameId Id) const {
diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index f2056de87cb94..b0d6f8335d9e0 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -44,11 +44,12 @@ class Context {
 
 protected:
   LLVMContext &LLVMCtx;
-  friend class Type;        // For LLVMCtx.
-  friend class PointerType; // For LLVMCtx.
-  friend class IntegerType; // For LLVMCtx.
-  friend class StructType;  // For LLVMCtx.
-  friend class Region;      // For LLVMCtx.
+  friend class Type;              // For LLVMCtx.
+  friend class PointerType;       // For LLVMCtx.
+  friend class IntegerType;       // For LLVMCtx.
+  friend class StructType;        // For LLVMCtx.
+  friend class Region;            // For LLVMCtx.
+  friend class IRSnapshotChecker; // To snapshot LLVMModuleToModuleMap.
 
   Tracker IRTracker;
 
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index d9642365908d2..d9b0e93725813 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -11,6 +11,7 @@
 
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/SandboxIR/BasicBlock.h"
 #include "llvm/SandboxIR/Constant.h"
@@ -2501,9 +2502,6 @@ class CmpInst : public SingleLLVMInstructionImpl<llvm::CmpInst> {
   WRAP_BOTH(isEquality);
   WRAP_BOTH(isRelational);
   WRAP_BOTH(isSigned);
-  WRAP_BOTH(getSignedPredicate);
-  WRAP_BOTH(getUnsignedPredicate);
-  WRAP_BOTH(getFlippedSignednessPredicate);
   WRAP_BOTH(isTrueWhenEqual);
   WRAP_BOTH(isFalseWhenEqual);
   WRAP_BOTH(isUnsigned);
@@ -2544,6 +2542,7 @@ class ICmpInst : public CmpInst {
 
   WRAP_BOTH(getSignedPredicate);
   WRAP_BOTH(getUnsignedPredicate);
+  WRAP_BOTH(getFlippedSignednessPredicate);
   WRAP_BOTH(isEquality);
   WRAP_MEMBER(isCommutative);
   WRAP_MEMBER(isRelational);
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index dab20eb809ba0..9a031f3270837 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -42,13 +42,12 @@
 
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
 #include "llvm/SandboxIR/Use.h"
 #include "llvm/Support/Debug.h"
 #include <memory>
-#include <regex>
 
 namespace llvm::sandboxir {
 
@@ -64,9 +63,56 @@ class SwitchInst;
 class ConstantInt;
 class ShuffleVectorInst;
 class CmpInst;
-class Module;
 class GlobalVariable;
 
+#ifndef NDEBUG
+
+/// A class that saves hashes and textual IR snapshots of functions in a
+/// SandboxIR Context, and does hash comparison when `expectNoDiff` is called.
+/// If hashes differ, it prints textual IR for both old and new versions to
+/// aid debugging.
+///
+/// This is used as an additional debug check when reverting changes to
+/// SandboxIR, to verify the reverted state matches the initial state.
+class IRSnapshotChecker {
+  Context &Ctx;
+
+  // A snapshot of textual IR for a function, with a hash for quick comparison.
+  struct FunctionSnapshot {
+    llvm::stable_hash Hash;
+    std::string TextualIR;
+  };
+
+  // A snapshot for each llvm::Function found in every module in the SandboxIR
+  // Context. In practice there will always be one module, but sandbox IR
+  // save/restore ops work at the Context level, so we must take the full state
+  // into account.
+  using ContextSnapshot = DenseMap<const llvm::Function *, FunctionSnapshot>;
+
+  ContextSnapshot OrigContextSnapshot;
+
+  // Dumps to a string the textual IR for a single Function.
+  std::string dumpIR(const llvm::Function &F) const;
+
+  // Returns a snapshot of all the modules in the sandbox IR context.
+  ContextSnapshot takeSnapshot() const;
+
+  // Compares two snapshots and returns true if they differ.
+  bool diff(const ContextSnapshot &Orig, const ContextSnapshot &Curr) const;
+
+public:
+  IRSnapshotChecker(Context &Ctx) : Ctx(Ctx) {}
+
+  /// Saves a snapshot of the current state. If there was any previous snapshot,
+  /// it will be replaced with the new one.
+  void save();
+
+  /// Checks the current state against the saved state; crashes if they differ.
+  void expectNoDiff();
+};
+
+#endif // NDEBUG
+
 /// The base class for IR Change classes.
 class IRChangeBase {
 protected:
@@ -405,6 +451,10 @@ class Tracker {
   TrackerState State = TrackerState::Disabled;
   Context &Ctx;
 
+#ifndef NDEBUG
+  IRSnapshotChecker SnapshotChecker;
+#endif
+
 public:
 #ifndef NDEBUG
   /// Helps catch bugs where we are creating new change objects while in the
@@ -412,7 +462,15 @@ class Tracker {
   bool InMiddleOfCreatingChange = false;
 #endif // NDEBUG
 
-  explicit Tracker(Context &Ctx) : Ctx(Ctx) {}
+  explicit Tracker(Context &Ctx)
+      : Ctx(Ctx)
+#ifndef NDEBUG
+        ,
+        SnapshotChecker(Ctx)
+#endif
+  {
+  }
+
   ~Tracker();
   Context &getContext() const { return Ctx; }
   /// Record \p Change and take ownership. This is the main function used to
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f8379609bf1d9..b0c63fc7c7b80 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule<
         [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_subo_no_overflow : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
+        [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 def match_extract_of_element_undef_vector: GICombineRule <
   (defs root:$root),
   (match (G_IMPLICIT_DEF $vector),
@@ -1901,6 +1907,12 @@ def cmp_combines: GICombineGroup<[
   redundant_binop_in_equality
 ]>;
 
+
+def overflow_combines: GICombineGroup<[
+  match_addos,
+  match_subo_no_overflow
+]>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1984,9 +1996,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
-    combine_concat_vector, match_addos,
+    combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
-    combine_use_vector_truncate, merge_combines]>;
+    combine_use_vector_truncate, merge_combines, overflow_combines]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index c6db4dfd7f515..55e7b417428c4 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -86,18 +86,19 @@ enum GPUKind : uint32_t {
   GK_GFX940 = 68,
   GK_GFX941 = 69,
   GK_GFX942 = 70,
-
-  GK_GFX1010 = 71,
-  GK_GFX1011 = 72,
-  GK_GFX1012 = 73,
-  GK_GFX1013 = 74,
-  GK_GFX1030 = 75,
-  GK_GFX1031 = 76,
-  GK_GFX1032 = 77,
-  GK_GFX1033 = 78,
-  GK_GFX1034 = 79,
-  GK_GFX1035 = 80,
-  GK_GFX1036 = 81,
+  GK_GFX950 = 71,
+
+  GK_GFX1010 = 72,
+  GK_GFX1011 = 73,
+  GK_GFX1012 = 74,
+  GK_GFX1013 = 75,
+  GK_GFX1030 = 76,
+  GK_GFX1031 = 77,
+  GK_GFX1032 = 78,
+  GK_GFX1033 = 79,
+  GK_GFX1034 = 80,
+  GK_GFX1035 = 81,
+  GK_GFX1036 = 82,
 
   GK_GFX1100 = 90,
   GK_GFX1101 = 91,
diff --git a/llvm/include/llvm/Transforms/Coroutines/ABI.h b/llvm/include/llvm/Transforms/Coroutines/ABI.h
index 8b83c5308056e..0b2d405f3caec 100644
--- a/llvm/include/llvm/Transforms/Coroutines/ABI.h
+++ b/llvm/include/llvm/Transforms/Coroutines/ABI.h
@@ -41,7 +41,7 @@ class BaseABI {
 public:
   BaseABI(Function &F, coro::Shape &S,
           std::function<bool(Instruction &)> IsMaterializable)
-      : F(F), Shape(S), IsMaterializable(IsMaterializable) {}
+      : F(F), Shape(S), IsMaterializable(std::move(IsMaterializable)) {}
   virtual ~BaseABI() = default;
 
   // Initialize the coroutine ABI
@@ -67,7 +67,7 @@ class SwitchABI : public BaseABI {
 public:
   SwitchABI(Function &F, coro::Shape &S,
             std::function<bool(Instruction &)> IsMaterializable)
-      : BaseABI(F, S, IsMaterializable) {}
+      : BaseABI(F, S, std::move(IsMaterializable)) {}
 
   void init() override;
 
@@ -80,7 +80,7 @@ class AsyncABI : public BaseABI {
 public:
   AsyncABI(Function &F, coro::Shape &S,
            std::function<bool(Instruction &)> IsMaterializable)
-      : BaseABI(F, S, IsMaterializable) {}
+      : BaseABI(F, S, std::move(IsMaterializable)) {}
 
   void init() override;
 
@@ -93,7 +93,7 @@ class AnyRetconABI : public BaseABI {
 public:
   AnyRetconABI(Function &F, coro::Shape &S,
                std::function<bool(Instruction &)> IsMaterializable)
-      : BaseABI(F, S, IsMaterializable) {}
+      : BaseABI(F, S, std::move(IsMaterializable)) {}
 
   void init() override;
 
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
index a197a2687ed02..344c9215fb822 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
@@ -16,8 +16,11 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/ProfileData/MemProf.h"
 
+#include <unordered_map>
+
 namespace llvm {
 class Function;
+class IndexedInstrProfReader;
 class Module;
 class TargetLibraryInfo;
 
@@ -66,6 +69,21 @@ namespace memprof {
 DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
 extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI);
 
+struct LineLocationHash {
+  uint64_t operator()(const LineLocation &Loc) const {
+    return Loc.getHashCode();
+  }
+};
+
+using LocToLocMap =
+    std::unordered_map<LineLocation, LineLocation, LineLocationHash>;
+
+// Compute an undrifting map.  The result is a map from caller GUIDs to an inner
+// map that maps source locations in the profile to those in the current IR.
+DenseMap<uint64_t, LocToLocMap>
+computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
+                  const TargetLibraryInfo &TLI);
+
 } // namespace memprof
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index e5f78ac228683..023c9de28209c 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -26,7 +26,7 @@ class AssumptionCache;
 class CallBase;
 class CallInst;
 class DominatorTree;
-class EarliestEscapeInfo;
+class EarliestEscapeAnalysis;
 class Function;
 class Instruction;
 class LoadInst;
@@ -49,7 +49,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   PostDominatorTree *PDT = nullptr;
   MemorySSA *MSSA = nullptr;
   MemorySSAUpdater *MSSAU = nullptr;
-  EarliestEscapeInfo *EEI = nullptr;
+  EarliestEscapeAnalysis *EEA = nullptr;
 
 public:
   MemCpyOptPass() = default;
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 1e8ef0102450e..049d68b8a3068 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -207,6 +207,20 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                const char *NameSuffix = "",
                                ClonedCodeInfo *CodeInfo = nullptr);
 
+/// Collect debug information such as types, compile units, and other
+/// subprograms that are reachable from \p F and can be considered global for
+/// the purposes of cloning (and hence not needing to be cloned).
+///
+/// What debug information should be processed depends on \p Changes: when
+/// cloning into the same module we process \p F's subprogram and instructions;
+/// when cloning into a cloned module, neither of those is processed.
+///
+/// Returns DISubprogram of the cloned function when cloning into the same
+/// module or nullptr otherwise.
+DISubprogram *CollectDebugInfoForCloning(const Function &F,
+                                         CloneFunctionChangeType Changes,
+                                         DebugInfoFinder &DIFinder);
+
 /// This class captures the data input to the InlineFunction call, and records
 /// the auxiliary results produced by it.
 class InlineFunctionInfo {
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
index 5211c7922ea2f..68a2daca1403d 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
@@ -117,7 +117,7 @@ class DGNode {
     assert(!isMemDepNodeCandidate(I) && "Expected Non-Mem instruction, ");
   }
   DGNode(const DGNode &Other) = delete;
-  virtual ~DGNode() = default;
+  virtual ~DGNode();
   /// \Returns the number of unscheduled successors.
   unsigned getNumUnscheduledSuccs() const { return UnscheduledSuccs; }
   void decrUnscheduledSuccs() {
@@ -290,6 +290,10 @@ class DependencyGraph {
   /// The DAG spans across all instructions in this interval.
   Interval<Instruction> DAGInterval;
 
+  Context *Ctx = nullptr;
+  std::optional<Context::CallbackID> CreateInstrCB;
+  std::optional<Context::CallbackID> EraseInstrCB;
+
   std::unique_ptr<BatchAAResults> BatchAA;
 
   enum class DependencyType {
@@ -325,9 +329,34 @@ class DependencyGraph {
   /// chain.
   void createNewNodes(const Interval<Instruction> &NewInterval);
 
+  /// Called by the callbacks when a new instruction \p I has been created.
+  void notifyCreateInstr(Instruction *I) {
+    getOrCreateNode(I);
+    // TODO: Update the dependencies for the new node.
+    // TODO: Update the MemDGNode chain to include the new node if needed.
+  }
+  /// Called by the callbacks when instruction \p I is about to get deleted.
+  void notifyEraseInstr(Instruction *I) {
+    InstrToNodeMap.erase(I);
+    // TODO: Update the dependencies.
+    // TODO: Update the MemDGNode chain to remove the node if needed.
+  }
+
 public:
-  DependencyGraph(AAResults &AA)
-      : BatchAA(std::make_unique<BatchAAResults>(AA)) {}
+  /// This constructor also registers callbacks.
+  DependencyGraph(AAResults &AA, Context &Ctx)
+      : Ctx(&Ctx), BatchAA(std::make_unique<BatchAAResults>(AA)) {
+    CreateInstrCB = Ctx.registerCreateInstrCallback(
+        [this](Instruction *I) { notifyCreateInstr(I); });
+    EraseInstrCB = Ctx.registerEraseInstrCallback(
+        [this](Instruction *I) { notifyEraseInstr(I); });
+  }
+  ~DependencyGraph() {
+    if (CreateInstrCB)
+      Ctx->unregisterCreateInstrCallback(*CreateInstrCB);
+    if (EraseInstrCB)
+      Ctx->unregisterEraseInstrCallback(*EraseInstrCB);
+  }
 
   DGNode *getNode(Instruction *I) const {
     auto It = InstrToNodeMap.find(I);
@@ -354,11 +383,6 @@ class DependencyGraph {
   Interval<Instruction> extend(ArrayRef<Instruction *> Instrs);
   /// \Returns the range of instructions included in the DAG.
   Interval<Instruction> getInterval() const { return DAGInterval; }
-  /// Called by the scheduler when a new instruction \p I has been created.
-  void notifyCreateInstr(Instruction *I) {
-    getOrCreateNode(I);
-    // TODO: Update the dependencies for the new node.
-  }
   void clear() {
     InstrToNodeMap.clear();
     DAGInterval = {};
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
index 6109db7161101..bd45634814b07 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
@@ -30,12 +30,17 @@ class BottomUpVec final : public FunctionPass {
   /// Creates and returns a vector instruction that replaces the instructions in
   /// \p Bndl. \p Operands are the already vectorized operands.
   Value *createVectorInstr(ArrayRef<Value *> Bndl, ArrayRef<Value *> Operands);
+  /// Erases all dead instructions from the dead instruction candidates
+  /// collected during vectorization.
   void tryEraseDeadInstrs();
+  /// Packs all elements of \p ToPack into a vector and returns that vector.
   Value *createPack(ArrayRef<Value *> ToPack);
+  /// Recursively try to vectorize \p Bndl and its operands.
   Value *vectorizeRec(ArrayRef<Value *> Bndl, unsigned Depth);
+  /// Entry point for vectorization starting from \p Seeds.
   bool tryVectorize(ArrayRef<Value *> Seeds);
 
-  // The PM containing the pipeline of region passes.
+  /// The PM containing the pipeline of region passes.
   RegionPassManager RPM;
 
 public:
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
index 9c11b5dbc1643..3959f84c601e0 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
@@ -69,6 +69,10 @@ class SchedBundle {
 private:
   ContainerTy Nodes;
 
+  /// Called by the DGNode destructor to avoid accessing freed memory.
+  void eraseFromBundle(DGNode *N) { Nodes.erase(find(Nodes, N)); }
+  friend DGNode::~DGNode(); // For eraseFromBundle().
+
 public:
   SchedBundle() = default;
   SchedBundle(ContainerTy &&Nodes) : Nodes(std::move(Nodes)) {
@@ -106,8 +110,6 @@ class Scheduler {
   std::optional<BasicBlock::iterator> ScheduleTopItOpt;
   // TODO: This is wasting memory in exchange for fast removal using a raw ptr.
   DenseMap<SchedBundle *, std::unique_ptr<SchedBundle>> Bndls;
-  Context &Ctx;
-  Context::CallbackID CreateInstrCB;
 
   /// \Returns a scheduling bundle containing \p Instrs.
   SchedBundle *createBundle(ArrayRef<Instruction *> Instrs);
@@ -137,11 +139,8 @@ class Scheduler {
   Scheduler &operator=(const Scheduler &) = delete;
 
 public:
-  Scheduler(AAResults &AA, Context &Ctx) : DAG(AA), Ctx(Ctx) {
-    CreateInstrCB = Ctx.registerCreateInstrCallback(
-        [this](Instruction *I) { DAG.notifyCreateInstr(I); });
-  }
-  ~Scheduler() { Ctx.unregisterCreateInstrCallback(CreateInstrCB); }
+  Scheduler(AAResults &AA, Context &Ctx) : DAG(AA, Ctx) {}
+  ~Scheduler() {}
 
   bool trySchedule(ArrayRef<Instruction *> Instrs);
 
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 1dcdad01f4c80..178ad863eb06a 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -191,13 +191,14 @@ static bool areBothVScale(const Value *V1, const Value *V2) {
 }
 
 //===----------------------------------------------------------------------===//
-// CaptureInfo implementations
+// CaptureAnalysis implementations
 //===----------------------------------------------------------------------===//
 
-CaptureInfo::~CaptureInfo() = default;
+CaptureAnalysis::~CaptureAnalysis() = default;
 
-bool SimpleCaptureInfo::isNotCapturedBefore(const Value *Object,
-                                            const Instruction *I, bool OrAt) {
+bool SimpleCaptureAnalysis::isNotCapturedBefore(const Value *Object,
+                                                const Instruction *I,
+                                                bool OrAt) {
   return isNonEscapingLocalObject(Object, &IsCapturedCache);
 }
 
@@ -209,8 +210,9 @@ static bool isNotInCycle(const Instruction *I, const DominatorTree *DT,
          !isPotentiallyReachableFromMany(Succs, BB, nullptr, DT, LI);
 }
 
-bool EarliestEscapeInfo::isNotCapturedBefore(const Value *Object,
-                                             const Instruction *I, bool OrAt) {
+bool EarliestEscapeAnalysis::isNotCapturedBefore(const Value *Object,
+                                                 const Instruction *I,
+                                                 bool OrAt) {
   if (!isIdentifiedFunctionLocal(Object))
     return false;
 
@@ -241,7 +243,7 @@ bool EarliestEscapeInfo::isNotCapturedBefore(const Value *Object,
   return !isPotentiallyReachable(Iter.first->second, I, nullptr, &DT, LI);
 }
 
-void EarliestEscapeInfo::removeInstruction(Instruction *I) {
+void EarliestEscapeAnalysis::removeInstruction(Instruction *I) {
   auto Iter = Inst2Obj.find(I);
   if (Iter != Inst2Obj.end()) {
     for (const Value *Obj : Iter->second)
@@ -946,7 +948,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
   // Make sure the object has not escaped here, and then check that none of the
   // call arguments alias the object below.
   if (!isa<Constant>(Object) && Call != Object &&
-      AAQI.CI->isNotCapturedBefore(Object, Call, /*OrAt*/ false)) {
+      AAQI.CA->isNotCapturedBefore(Object, Call, /*OrAt*/ false)) {
 
     // Optimistically assume that call doesn't touch Object and check this
     // assumption in the following loop.
@@ -1621,10 +1623,10 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
     // temporary store the nocapture argument's value in a temporary memory
     // location if that memory location doesn't escape. Or it may pass a
     // nocapture value to other functions as long as they don't capture it.
-    if (isEscapeSource(O1) && AAQI.CI->isNotCapturedBefore(
+    if (isEscapeSource(O1) && AAQI.CA->isNotCapturedBefore(
                                   O2, dyn_cast<Instruction>(O1), /*OrAt*/ true))
       return AliasResult::NoAlias;
-    if (isEscapeSource(O2) && AAQI.CI->isNotCapturedBefore(
+    if (isEscapeSource(O2) && AAQI.CA->isNotCapturedBefore(
                                   O1, dyn_cast<Instruction>(O2), /*OrAt*/ true))
       return AliasResult::NoAlias;
   }
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index 45702fa25d8b1..12553dd446a61 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -199,13 +199,12 @@ void InlineAdvice::recordInliningWithCalleeDeleted() {
 
 AnalysisKey InlineAdvisorAnalysis::Key;
 AnalysisKey PluginInlineAdvisorAnalysis::Key;
-bool PluginInlineAdvisorAnalysis::HasBeenRegistered = false;
 
 bool InlineAdvisorAnalysis::Result::tryCreate(
     InlineParams Params, InliningAdvisorMode Mode,
     const ReplayInlinerSettings &ReplaySettings, InlineContext IC) {
   auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  if (PluginInlineAdvisorAnalysis::HasBeenRegistered) {
+  if (MAM.isPassRegistered<PluginInlineAdvisorAnalysis>()) {
     auto &DA = MAM.getResult<PluginInlineAdvisorAnalysis>(M);
     Advisor.reset(DA.Factory(M, FAM, Params, IC));
     return !!Advisor;
diff --git a/llvm/lib/Analysis/InlineOrder.cpp b/llvm/lib/Analysis/InlineOrder.cpp
index f156daa2f126f..8d920153f250d 100644
--- a/llvm/lib/Analysis/InlineOrder.cpp
+++ b/llvm/lib/Analysis/InlineOrder.cpp
@@ -283,7 +283,6 @@ class PriorityInlineOrder : public InlineOrder<std::pair<CallBase *, int>> {
 } // namespace
 
 AnalysisKey llvm::PluginInlineOrderAnalysis::Key;
-bool llvm::PluginInlineOrderAnalysis::HasBeenRegistered;
 
 std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>>
 llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM,
@@ -313,7 +312,7 @@ llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM,
 std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>>
 llvm::getInlineOrder(FunctionAnalysisManager &FAM, const InlineParams &Params,
                      ModuleAnalysisManager &MAM, Module &M) {
-  if (llvm::PluginInlineOrderAnalysis::isRegistered()) {
+  if (MAM.isPassRegistered<PluginInlineOrderAnalysis>()) {
     LLVM_DEBUG(dbgs() << "    Current used priority: plugin ---- \n");
     return MAM.getResult<PluginInlineOrderAnalysis>(M).Factory(FAM, Params, MAM,
                                                                M);
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 93b601b22c3a3..01b0a089aab71 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1857,27 +1857,31 @@ static Value *simplifyAndOrOfFCmps(const SimplifyQuery &Q, FCmpInst *LHS,
     return nullptr;
 
   FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+  auto AbsOrSelfLHS0 = m_CombineOr(m_Specific(LHS0), m_FAbs(m_Specific(LHS0)));
   if ((PredL == FCmpInst::FCMP_ORD || PredL == FCmpInst::FCMP_UNO) &&
       ((FCmpInst::isOrdered(PredR) && IsAnd) ||
        (FCmpInst::isUnordered(PredR) && !IsAnd))) {
-    // (fcmp ord X, 0) & (fcmp o** X, Y) --> fcmp o** X, Y
-    // (fcmp uno X, 0) & (fcmp o** X, Y) --> false
-    // (fcmp uno X, 0) | (fcmp u** X, Y) --> fcmp u** X, Y
-    // (fcmp ord X, 0) | (fcmp u** X, Y) --> true
-    if ((LHS0 == RHS0 || LHS0 == RHS1) && match(LHS1, m_PosZeroFP()))
+    // (fcmp ord X, 0) & (fcmp o** X/abs(X), Y) --> fcmp o** X/abs(X), Y
+    // (fcmp uno X, 0) & (fcmp o** X/abs(X), Y) --> false
+    // (fcmp uno X, 0) | (fcmp u** X/abs(X), Y) --> fcmp u** X/abs(X), Y
+    // (fcmp ord X, 0) | (fcmp u** X/abs(X), Y) --> true
+    if ((match(RHS0, AbsOrSelfLHS0) || match(RHS1, AbsOrSelfLHS0)) &&
+        match(LHS1, m_PosZeroFP()))
       return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR)
                  ? static_cast<Value *>(RHS)
                  : ConstantInt::getBool(LHS->getType(), !IsAnd);
   }
 
+  auto AbsOrSelfRHS0 = m_CombineOr(m_Specific(RHS0), m_FAbs(m_Specific(RHS0)));
   if ((PredR == FCmpInst::FCMP_ORD || PredR == FCmpInst::FCMP_UNO) &&
       ((FCmpInst::isOrdered(PredL) && IsAnd) ||
        (FCmpInst::isUnordered(PredL) && !IsAnd))) {
-    // (fcmp o** X, Y) & (fcmp ord X, 0) --> fcmp o** X, Y
-    // (fcmp o** X, Y) & (fcmp uno X, 0) --> false
-    // (fcmp u** X, Y) | (fcmp uno X, 0) --> fcmp u** X, Y
-    // (fcmp u** X, Y) | (fcmp ord X, 0) --> true
-    if ((RHS0 == LHS0 || RHS0 == LHS1) && match(RHS1, m_PosZeroFP()))
+    // (fcmp o** X/abs(X), Y) & (fcmp ord X, 0) --> fcmp o** X/abs(X), Y
+    // (fcmp o** X/abs(X), Y) & (fcmp uno X, 0) --> false
+    // (fcmp u** X/abs(X), Y) | (fcmp uno X, 0) --> fcmp u** X/abs(X), Y
+    // (fcmp u** X/abs(X), Y) | (fcmp ord X, 0) --> true
+    if ((match(LHS0, AbsOrSelfRHS0) || match(LHS1, AbsOrSelfRHS0)) &&
+        match(RHS1, m_PosZeroFP()))
       return FCmpInst::isOrdered(PredL) == FCmpInst::isOrdered(PredR)
                  ? static_cast<Value *>(LHS)
                  : ConstantInt::getBool(LHS->getType(), !IsAnd);
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index c5fba184cd085..c40bbd9e18e79 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -268,7 +268,7 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
 MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
     const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
     BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
-  BatchAAResults BatchAA(AA, &EII);
+  BatchAAResults BatchAA(AA, &EEA);
   return getPointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst, Limit,
                                   BatchAA);
 }
@@ -1198,7 +1198,7 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
   bool GotWorklistLimit = false;
   LLVM_DEBUG(AssertSorted(*Cache));
 
-  BatchAAResults BatchAA(AA, &EII);
+  BatchAAResults BatchAA(AA, &EEA);
   while (!Worklist.empty()) {
     BasicBlock *BB = Worklist.pop_back_val();
 
@@ -1510,7 +1510,7 @@ void MemoryDependenceResults::invalidateCachedPredecessors() {
 }
 
 void MemoryDependenceResults::removeInstruction(Instruction *RemInst) {
-  EII.removeInstruction(RemInst);
+  EEA.removeInstruction(RemInst);
 
   // Walk through the Non-local dependencies, removing this one as the value
   // for any cached queries.
diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index aa550f0b6a7bf..f672bd0e1e133 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -1403,9 +1403,11 @@ void MemorySSAUpdater::changeToUnreachable(const Instruction *I) {
 
 MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB(
     Instruction *I, MemoryAccess *Definition, const BasicBlock *BB,
-    MemorySSA::InsertionPlace Point) {
-  MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
-  MSSA->insertIntoListsForBlock(NewAccess, BB, Point);
+    MemorySSA::InsertionPlace Point, bool CreationMustSucceed) {
+  MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(
+      I, Definition, /*Template=*/nullptr, CreationMustSucceed);
+  if (NewAccess)
+    MSSA->insertIntoListsForBlock(NewAccess, BB, Point);
   return NewAccess;
 }
 
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 46b108606f6a6..376f260846bba 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11874,7 +11874,7 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
                                    CmpInst::Predicate P2) {
     assert(P1 != P2 && "Handled earlier!");
     return CmpInst::isRelational(P2) &&
-           P1 == CmpInst::getFlippedSignednessPredicate(P2);
+           P1 == ICmpInst::getFlippedSignednessPredicate(P2);
   };
   if (IsSignFlippedPredicate(Pred, FoundPred)) {
     // Unsigned comparison is the same as signed comparison when both the
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bc6a528c9dab3..174e5e87abe53 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
   return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
 }
 
+unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
+  return TTIImpl->getEpilogueVectorizationMinVF();
+}
+
 bool TargetTransformInfo::preferPredicateOverEpilogue(
     TailFoldingInfo *TFI) const {
   return TTIImpl->preferPredicateOverEpilogue(TFI);
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 9ca76b54a88d9..3e6abacac2726 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7997,7 +7997,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_STACK_IDS: { // [n x stackid]
       // Save stack ids in the reader to consult when adding stack ids from the
       // lists in the stack node and alloc node entries.
-      StackIds = ArrayRef<uint64_t>(Record);
+      if (Version <= 11) {
+        StackIds = ArrayRef<uint64_t>(Record);
+        break;
+      }
+      // This is an array of 32-bit fixed-width values, holding each 64-bit
+      // stack id as a pair of adjacent (most significant first) 32-bit words.
+      assert(Record.size() % 2 == 0);
+      StackIds.reserve(Record.size() / 2);
+      for (auto R = Record.begin(); R != Record.end(); R += 2)
+        StackIds.push_back(*R << 32 | *(R + 1));
       break;
     }
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 5829af39cf5e2..80e12bef502ac 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4429,12 +4429,17 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
     // numids x stackid
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-    // FIXME: The stack ids are hashes that are close to 64 bits in size, so
-    // emitting as a pair of 32-bit fixed-width values, as we do for context
-    // ids, would be more efficient.
-    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    // The stack ids are hashes that are close to 64 bits in size, so emitting
+    // as a pair of 32-bit fixed-width values is more efficient than a VBR.
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
     unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
-    Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
+    SmallVector<uint32_t> Vals;
+    Vals.reserve(Index->stackIds().size() * 2);
+    for (auto Id : Index->stackIds()) {
+      Vals.push_back(static_cast<uint32_t>(Id >> 32));
+      Vals.push_back(static_cast<uint32_t>(Id));
+    }
+    Stream.EmitRecord(bitc::FS_STACK_IDS, Vals, StackIdAbbvId);
   }
 
   // n x context id
@@ -4624,9 +4629,17 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
     // numids x stackid
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    // The stack ids are hashes that are close to 64 bits in size, so emitting
+    // as a pair of 32-bit fixed-width values is more efficient than a VBR.
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
     unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
-    Stream.EmitRecord(bitc::FS_STACK_IDS, StackIds, StackIdAbbvId);
+    SmallVector<uint32_t> Vals;
+    Vals.reserve(StackIds.size() * 2);
+    for (auto Id : StackIds) {
+      Vals.push_back(static_cast<uint32_t>(Id >> 32));
+      Vals.push_back(static_cast<uint32_t>(Id));
+    }
+    Stream.EmitRecord(bitc::FS_STACK_IDS, Vals, StackIdAbbvId);
   }
 
   // Abbrev for FS_COMBINED_PROFILE.
@@ -5488,8 +5501,9 @@ void llvm::embedBitcodeInModule(llvm::Module &M, llvm::MemoryBufferRef Buf,
   // Save llvm.compiler.used and remove it.
   SmallVector<Constant *, 2> UsedArray;
   SmallVector<GlobalValue *, 4> UsedGlobals;
-  Type *UsedElementType = PointerType::getUnqual(M.getContext());
   GlobalVariable *Used = collectUsedGlobalVariables(M, UsedGlobals, true);
+  Type *UsedElementType = Used ? Used->getValueType()->getArrayElementType()
+                               : PointerType::getUnqual(M.getContext());
   for (auto *GV : UsedGlobals) {
     if (GV->getName() != "llvm.embedded.module" &&
         GV->getName() != "llvm.cmdline")
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 1d24fa44a5cea..ff9be3dc24ce5 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3849,6 +3849,8 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
                                    AsmPrinter &AP, const Constant *BaseCV,
                                    uint64_t Offset,
                                    AsmPrinter::AliasMapTy *AliasList) {
+  assert((!AliasList || AP.TM.getTargetTriple().isOSBinFormatXCOFF()) &&
+         "AliasList only expected for XCOFF");
   emitGlobalAliasInline(AP, Offset, AliasList);
   uint64_t Size = DL.getTypeAllocSize(CV->getType());
 
@@ -3858,7 +3860,27 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
   if (!BaseCV && CV->hasOneUse())
     BaseCV = dyn_cast<Constant>(CV->user_back());
 
-  if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+  if (isa<ConstantAggregateZero>(CV)) {
+    StructType *structType;
+    if (AliasList && (structType = llvm::dyn_cast<StructType>(CV->getType()))) {
+      // Handle aliases to struct elements (i + 1 < n avoids wrap when n == 0)
+      const StructLayout *Layout = DL.getStructLayout(structType);
+      uint64_t SizeSoFar = 0;
+      for (unsigned int i = 0, n = structType->getNumElements(); i + 1 < n;
+           ++i) {
+        uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar;
+        AP.OutStreamer->emitZeros(GapToNext);
+        SizeSoFar += GapToNext;
+        emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList);
+      }
+      AP.OutStreamer->emitZeros(Size - SizeSoFar);
+      return;
+    } else {
+      return AP.OutStreamer->emitZeros(Size);
+    }
+  }
+
+  if (isa<UndefValue>(CV))
     return AP.OutStreamer->emitZeros(Size);
 
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp
index 0edffc0beef8e..02152a136a225 100644
--- a/llvm/lib/CodeGen/CFIFixup.cpp
+++ b/llvm/lib/CodeGen/CFIFixup.cpp
@@ -67,7 +67,12 @@
 
 #include "llvm/CodeGen/CFIFixup.h"
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -76,6 +81,8 @@
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/Target/TargetMachine.h"
 
+#include <iterator>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "cfi-fixup"
@@ -120,7 +127,7 @@ findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) {
 // iterator can point to the end of the block. Instructions are inserted
 // *before* the iterator.
 struct InsertionPoint {
-  MachineBasicBlock *MBB;
+  MachineBasicBlock *MBB = nullptr;
   MachineBasicBlock::iterator Iterator;
 };
 
@@ -150,6 +157,30 @@ insertRememberRestorePair(const InsertionPoint &RememberInsertPt,
                         ->getIterator())};
 }
 
+// Copies all CFI instructions before PrologueEnd and inserts them before
+// DstInsertPt. Returns the iterator to the first instruction after the
+// inserted instructions.
+static InsertionPoint cloneCfiPrologue(const InsertionPoint &PrologueEnd,
+                                       const InsertionPoint &DstInsertPt) {
+  MachineFunction &MF = *DstInsertPt.MBB->getParent();
+
+  auto cloneCfiInstructions = [&](MachineBasicBlock::iterator Begin,
+                                  MachineBasicBlock::iterator End) {
+    auto ToClone = map_range(
+        make_filter_range(make_range(Begin, End), isPrologueCFIInstruction),
+        [&](const MachineInstr &MI) { return MF.CloneMachineInstr(&MI); });
+    DstInsertPt.MBB->insert(DstInsertPt.Iterator, ToClone.begin(),
+                            ToClone.end());
+  };
+
+  // Clone all CFI instructions from previous blocks.
+  for (auto &MBB : make_range(MF.begin(), PrologueEnd.MBB->getIterator()))
+    cloneCfiInstructions(MBB.begin(), MBB.end());
+  // Clone all CFI instructions from the final prologue block.
+  cloneCfiInstructions(PrologueEnd.MBB->begin(), PrologueEnd.Iterator);
+  return DstInsertPt;
+}
+
 bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering();
   if (!TFL.enableCFIFixup(MF))
@@ -172,7 +203,8 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
     bool HasFrameOnEntry : 1;
     bool HasFrameOnExit : 1;
   };
-  SmallVector<BlockFlags, 32> BlockInfo(NumBlocks, {false, false, false, false});
+  SmallVector<BlockFlags, 32> BlockInfo(NumBlocks,
+                                        {false, false, false, false});
   BlockInfo[0].Reachable = true;
   BlockInfo[0].StrongNoFrameOnEntry = true;
 
@@ -209,10 +241,11 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   // of the previous block. If the intended frame state is different, insert
   // compensating CFI instructions.
   bool Change = false;
-  // `InsertPt` always points to the point in a preceding block where we have to
-  // insert a `.cfi_remember_state`, in the case that the current block needs a
-  // `.cfi_restore_state`.
-  InsertionPoint InsertPt = {PrologueBlock, PrologueEnd};
+  // `InsertionPts[sectionID]` always points to the point in a preceding block
+  // where we have to insert a `.cfi_remember_state`, in the case that the
+  // current block needs a `.cfi_restore_state`.
+  SmallDenseMap<MBBSectionID, InsertionPoint> InsertionPts;
+  InsertionPts[PrologueBlock->getSectionID()] = {PrologueBlock, PrologueEnd};
 
   assert(PrologueEnd != PrologueBlock->begin() &&
          "Inconsistent notion of \"prologue block\"");
@@ -239,14 +272,28 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
       }
     }
 #endif
+
+    // If the block is the first block in its section, then it doesn't have a
+    // frame on entry.
+    HasFrame &= !CurrBB->isBeginSection();
     if (!Info.StrongNoFrameOnEntry && Info.HasFrameOnEntry && !HasFrame) {
       // Reset to the "after prologue" state.
 
-      // There's an earlier block known to have a stack frame. Insert a
-      // `.cfi_remember_state` instruction into that block and a
-      // `.cfi_restore_state` instruction at the beginning of the current block.
-      InsertPt = insertRememberRestorePair(
-          InsertPt, InsertionPoint{&*CurrBB, CurrBB->begin()});
+      InsertionPoint &InsertPt = InsertionPts[CurrBB->getSectionID()];
+      if (InsertPt.MBB == nullptr) {
+        // CurrBB is the first block in its section, so there is no "after
+        // prologue" state. Clone the CFI instructions from the prologue block
+        // to create it.
+        InsertPt = cloneCfiPrologue({PrologueBlock, PrologueEnd},
+                                    {&*CurrBB, CurrBB->begin()});
+      } else {
+        // There's an earlier block known to have a stack frame. Insert a
+        // `.cfi_remember_state` instruction into that block and a
+        // `.cfi_restore_state` instruction at the beginning of the current
+        // block.
+        InsertPt =
+            insertRememberRestorePair(InsertPt, {&*CurrBB, CurrBB->begin()});
+      }
       Change = true;
     } else if ((Info.StrongNoFrameOnEntry || !Info.HasFrameOnEntry) &&
                HasFrame) {
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 1de01e402e59e..a1acb4ef36838 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -686,7 +686,7 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
     }
     // Generate new code and remove the original memcmp call and the user
     if (ICmpInst::isSigned(Pred)) {
-      Value *Cmp = Builder.CreateICmp(CmpInst::getUnsignedPredicate(Pred),
+      Value *Cmp = Builder.CreateICmp(ICmpInst::getUnsignedPredicate(Pred),
                                       Loads.Lhs, Loads.Rhs);
       auto *Result = NeedsZExt ? Builder.CreateZExt(Cmp, UI->getType()) : Cmp;
       UI->replaceAllUsesWith(Result);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 83d78c0bde399..d95fc8cfbcf55 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7790,3 +7790,78 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,
 
   return true;
 }
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+                                       BuildFnTy &MatchInfo) {
+  const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+  Register Dst = Subo->getReg(0);
+  Register LHS = Subo->getLHSReg();
+  Register RHS = Subo->getRHSReg();
+  Register Carry = Subo->getCarryOutReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // Check legality before known bits.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  ConstantRange KBLHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+                                   /* IsSigned=*/Subo->isSigned());
+  ConstantRange KBRHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+                                   /* IsSigned=*/Subo->isSigned());
+
+  if (Subo->isSigned()) {
+    // G_SSUBO
+    switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS);
+        B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                              /*isVector=*/CarryTy.isVector(),
+                                              /*isFP=*/false));
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // G_USUBO
+  switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+  case ConstantRange::OverflowResult::MayOverflow:
+    return false;
+  case ConstantRange::OverflowResult::NeverOverflows: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS);
+      B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                            /*isVector=*/CarryTy.isVector(),
+                                            /*isFP=*/false));
+    };
+    return true;
+  }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 40d4a5250dfbb..a700d866afa4e 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -147,6 +147,15 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
   unsigned Opcode = MI.getOpcode();
   LLT DstTy = MRI.getType(R);
 
+  // Handle the case where this is called on a register that does not have a
+  // type constraint (i.e. it has a register class constraint instead). This is
+  // unlikely to occur except by looking through copies but it is possible for
+  // the initial register being queried to be in this state.
+  if (!DstTy.isValid()) {
+    Known = KnownBits();
+    return;
+  }
+
 #ifndef NDEBUG
   if (DstTy.isFixedVector()) {
     assert(
@@ -158,15 +167,6 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
   }
 #endif
 
-  // Handle the case where this is called on a register that does not have a
-  // type constraint (i.e. it has a register class constraint instead). This is
-  // unlikely to occur except by looking through copies but it is possible for
-  // the initial register being queried to be in this state.
-  if (!DstTy.isValid()) {
-    Known = KnownBits();
-    return;
-  }
-
   unsigned BitWidth = DstTy.getScalarSizeInBits();
   auto CacheEntry = ComputeKnownBitsCache.find(R);
   if (CacheEntry != ComputeKnownBitsCache.end()) {
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index bdad63f368dfe..0f68313e64f54 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3558,14 +3558,16 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
 
     if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
                             /*AfterPlacement=*/true)) {
-      // Redo the layout if tail merging creates/removes/moves blocks.
-      BlockToChain.clear();
-      ComputedEdges.clear();
       // Must redo the post-dominator tree if blocks were changed.
       if (MPDT)
         MPDT->recalculate(MF);
-      ChainAllocator.DestroyAll();
-      buildCFGChains();
+      if (!UseExtTspForSize) {
+        // Redo the layout if tail merging creates/removes/moves blocks.
+        BlockToChain.clear();
+        ComputedEdges.clear();
+        ChainAllocator.DestroyAll();
+        buildCFGChains();
+      }
     }
   }
 
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index c470bd71dfb29..0def107f6306d 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1227,7 +1227,8 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
       AllSuccs, [&](const MachineBasicBlock *L, const MachineBasicBlock *R) {
         uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
         uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
-        if (llvm::shouldOptimizeForSize(MBB, PSI, MBFI) || !LHSFreq || !RHSFreq)
+        if (llvm::shouldOptimizeForSize(MBB, PSI, MBFI) ||
+            (!LHSFreq && !RHSFreq))
           return CI->getCycleDepth(L) < CI->getCycleDepth(R);
         return LHSFreq < RHSFreq;
       });
diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp
index 2077da3cc7c5a..483e61db788f4 100644
--- a/llvm/lib/CodeGen/RDFGraph.cpp
+++ b/llvm/lib/CodeGen/RDFGraph.cpp
@@ -1515,15 +1515,13 @@ void DataFlowGraph::linkRefUp(Instr IA, NodeAddr<T> TA, DefStack &DS) {
   for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
     RegisterRef QR = I->Addr->getRegRef(*this);
 
-    // Skip all defs that are aliased to any of the defs that we have already
-    // seen. If this completes a cover of RR, stop the stack traversal.
-    bool Alias = Defs.hasAliasOf(QR);
-    bool Cover = Defs.insert(QR).hasCoverOf(RR);
-    if (Alias) {
-      if (Cover)
-        break;
+    // Skip all defs that we have already seen.
+    // If this completes a cover of RR, stop the stack traversal.
+    bool Seen = Defs.hasCoverOf(QR);
+    if (Seen)
       continue;
-    }
+
+    bool Cover = Defs.insert(QR).hasCoverOf(RR);
 
     // The reaching def.
     Def RDA = *I;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 2e1f498c090d1..073ce367af1b8 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1374,6 +1374,27 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   MachineInstr &NewMI = *std::prev(MII);
   NewMI.setDebugLoc(DL);
 
+  // In a situation like the following:
+  //
+  //    undef %2.subreg:reg = INST %1:reg         ; DefMI (rematerializable),
+  //                                              ; DefSubIdx = subreg
+  //    %3:reg = COPY %2                          ; SrcIdx = DstIdx = 0
+  //    .... = SOMEINSTR %3:reg
+  //
+  // there are no subranges for %3 so after rematerialization we need
+  // to explicitly create them. Undefined subranges are removed later on.
+  if (DstReg.isVirtual() && DefSubIdx && !CP.getSrcIdx() && !CP.getDstIdx() &&
+      MRI->shouldTrackSubRegLiveness(DstReg)) {
+    LiveInterval &DstInt = LIS->getInterval(DstReg);
+    if (!DstInt.hasSubRanges()) {
+      LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg);
+      LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(DefSubIdx);
+      LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
+      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UsedLanes, DstInt);
+      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UnusedLanes, DstInt);
+    }
+  }
+
   // In a situation like the following:
   //     %0:subreg = instr              ; DefMI, subreg = DstIdx
   //     %1        = copy %0:subreg ; CopyMI, SrcIdx = 0
@@ -1486,6 +1507,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
         NewRC = TRI->getCommonSubClass(NewRC, DefRC);
       assert(NewRC && "subreg chosen for remat incompatible with instruction");
     }
+
     // Remap subranges to new lanemask and change register class.
     LiveInterval &DstInt = LIS->getInterval(DstReg);
     for (LiveInterval::SubRange &SR : DstInt.subranges()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 5abd7cb97bda5..3f8d117400efd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -791,28 +791,29 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) {
   return ReturnVal;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_FSINCOS(SDNode *N) {
-  assert(!N->isStrictFPOpcode() && "strictfp not implemented for fsincos");
+SDValue
+DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(SDNode *N,
+                                                       RTLIB::Libcall LC) {
+  assert(!N->isStrictFPOpcode() && "strictfp not implemented");
   EVT VT = N->getValueType(0);
-  RTLIB::Libcall LC = RTLIB::getFSINCOS(VT);
 
   if (!TLI.getLibcallName(LC))
     return SDValue();
 
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-  SDValue StackSlotSin = DAG.CreateStackTemporary(NVT);
-  SDValue StackSlotCos = DAG.CreateStackTemporary(NVT);
+  SDValue FirstResultSlot = DAG.CreateStackTemporary(NVT);
+  SDValue SecondResultSlot = DAG.CreateStackTemporary(NVT);
 
   SDLoc DL(N);
 
   TargetLowering::MakeLibCallOptions CallOptions;
-  std::array Ops{GetSoftenedFloat(N->getOperand(0)), StackSlotSin,
-                 StackSlotCos};
-  std::array OpsVT{VT, StackSlotSin.getValueType(),
-                   StackSlotCos.getValueType()};
+  std::array Ops{GetSoftenedFloat(N->getOperand(0)), FirstResultSlot,
+                 SecondResultSlot};
+  std::array OpsVT{VT, FirstResultSlot.getValueType(),
+                   SecondResultSlot.getValueType()};
 
   // TODO: setTypeListBeforeSoften can't properly express multiple return types,
-  // but since both returns have the same type for sincos it should be okay.
+  // but since both returns have the same type it should be okay.
   CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true);
 
   auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, DL,
@@ -824,12 +825,17 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSINCOS(SDNode *N) {
         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
     return DAG.getLoad(NVT, DL, Chain, StackSlot, PtrInfo);
   };
-  SetSoftenedFloat(SDValue(N, 0), CreateStackLoad(StackSlotSin));
-  SetSoftenedFloat(SDValue(N, 1), CreateStackLoad(StackSlotCos));
+  SetSoftenedFloat(SDValue(N, 0), CreateStackLoad(FirstResultSlot));
+  SetSoftenedFloat(SDValue(N, 1), CreateStackLoad(SecondResultSlot));
 
   return SDValue();
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSINCOS(SDNode *N) {
+  return SoftenFloatRes_UnaryWithTwoFPResults( // shared two-result helper
+      N, RTLIB::getFSINCOS(N->getValueType(0))); // libcall from result 0's VT
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
   return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
                                                RTLIB::REM_F32,
@@ -2761,7 +2767,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FFREXP:     R = PromoteFloatRes_FFREXP(N); break;
 
     case ISD::FSINCOS:
-      R = PromoteFloatRes_FSINCOS(N);
+      R = PromoteFloatRes_UnaryWithTwoFPResults(N);
       break;
 
     case ISD::FP_ROUND:   R = PromoteFloatRes_FP_ROUND(N); break;
@@ -2959,7 +2965,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FFREXP(SDNode *N) {
   return Res;
 }
 
-SDValue DAGTypeLegalizer::PromoteFloatRes_FSINCOS(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteFloatRes_UnaryWithTwoFPResults(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
   SDValue Op = GetPromotedFloat(N->getOperand(0));
@@ -3223,7 +3229,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
   case ISD::FFREXP:      R = SoftPromoteHalfRes_FFREXP(N); break;
 
   case ISD::FSINCOS:
-    R = SoftPromoteHalfRes_FSINCOS(N);
+    R = SoftPromoteHalfRes_UnaryWithTwoFPResults(N);
     break;
 
   case ISD::LOAD:        R = SoftPromoteHalfRes_LOAD(N); break;
@@ -3382,7 +3388,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FFREXP(SDNode *N) {
   return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
 }
 
-SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FSINCOS(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryWithTwoFPResults(SDNode *N) {
   EVT OVT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
   SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 45487c887b74d..648719bcabc37 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -83,6 +83,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+  case ISD::VP_LOAD:
+    Res = PromoteIntRes_VP_LOAD(cast<VPLoadSDNode>(N));
+    break;
   case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
   case ISD::MGATHER:     Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
@@ -957,6 +960,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
   return Res;
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_LOAD(VPLoadSDNode *N) {
+  assert(!N->isIndexed() && "Indexed vp_load during type legalization!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  ISD::LoadExtType ExtType = (N->getExtensionType() == ISD::NON_EXTLOAD)
+                                 ? ISD::EXTLOAD // non-extending -> EXTLOAD
+                                 : N->getExtensionType(); // otherwise kept
+  SDLoc dl(N);
+  SDValue Res = // N rebuilt with result type NVT and extension kind ExtType
+      DAG.getLoadVP(N->getAddressingMode(), ExtType, NVT, dl, N->getChain(),
+                    N->getBasePtr(), N->getOffset(), N->getMask(),
+                    N->getVectorLength(), N->getMemoryVT(), N->getMemOperand());
+  // Result 1 of N is the chain: redirect everything that used the old chain
+  // to the chain of the rebuilt load. The rebuilt load itself is returned.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
@@ -1957,6 +1977,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break;
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
+  case ISD::VP_STORE:
+    Res = PromoteIntOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
+    break;
   case ISD::MSTORE:       Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
                                                     OpNo); break;
   case ISD::MLOAD:        Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
@@ -2378,6 +2401,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
                            N->getMemoryVT(), N->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_STORE(VPStoreSDNode *N,
+                                                unsigned OpNo) {
+  // Rebuild the VP store with its value operand promoted (OpNo must be 1).
+  assert(OpNo == 1 && "Unexpected operand for promotion");
+  assert(!N->isIndexed() && "expecting unindexed vp_store!");
+  // Emit a truncating VP store of the promoted value; the memory VT is kept.
+  SDValue DataOp = GetPromotedInteger(N->getValue());
+  return DAG.getTruncStoreVP(N->getChain(), SDLoc(N), DataOp, N->getBasePtr(),
+                             N->getMask(), N->getVectorLength(),
+                             N->getMemoryVT(), N->getMemOperand(),
+                             N->isCompressingStore());
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
                                               unsigned OpNo) {
   SDValue DataOp = N->getValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a56cd5423e00b..1703149aca746 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -338,6 +338,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_FREEZE(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_VP_LOAD(VPLoadSDNode *N);
   SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
   SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
   SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N);
@@ -420,6 +421,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntOp_ExpOp(SDNode *N);
   SDValue PromoteIntOp_VECREDUCE(SDNode *N);
   SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
   SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
@@ -555,6 +557,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   // Convert Float Results to Integer.
   void SoftenFloatResult(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC);
+  SDValue SoftenFloatRes_UnaryWithTwoFPResults(SDNode *N, RTLIB::Libcall LC);
   SDValue SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC);
   SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_ARITH_FENCE(SDNode *N);
@@ -741,13 +744,13 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void PromoteFloatResult(SDNode *N, unsigned ResNo);
   SDValue PromoteFloatRes_BITCAST(SDNode *N);
   SDValue PromoteFloatRes_BinOp(SDNode *N);
+  SDValue PromoteFloatRes_UnaryWithTwoFPResults(SDNode *N);
   SDValue PromoteFloatRes_ConstantFP(SDNode *N);
   SDValue PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteFloatRes_FCOPYSIGN(SDNode *N);
   SDValue PromoteFloatRes_FMAD(SDNode *N);
   SDValue PromoteFloatRes_ExpOp(SDNode *N);
   SDValue PromoteFloatRes_FFREXP(SDNode *N);
-  SDValue PromoteFloatRes_FSINCOS(SDNode *N);
   SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
   SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N);
   SDValue PromoteFloatRes_LOAD(SDNode *N);
@@ -789,6 +792,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
   SDValue SoftPromoteHalfRes_ARITH_FENCE(SDNode *N);
   SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
+  SDValue SoftPromoteHalfRes_UnaryWithTwoFPResults(SDNode *N);
   SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
   SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
   SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N);
@@ -796,7 +800,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftPromoteHalfRes_FMAD(SDNode *N);
   SDValue SoftPromoteHalfRes_ExpOp(SDNode *N);
   SDValue SoftPromoteHalfRes_FFREXP(SDNode *N);
-  SDValue SoftPromoteHalfRes_FSINCOS(SDNode *N);
   SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
   SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index dcce484a7a37e..a0ce7810f91b0 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -413,6 +413,15 @@ bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
   return false;
 }
 
+std::optional<uint64_t> DWARFDie::getLanguage() const {
+  if (isValid()) { // U is only dereferenced for a valid DIE
+    if (std::optional<DWARFFormValue> LV =
+            U->getUnitDIE().find(dwarf::DW_AT_language)) // on the unit DIE
+      return LV->getAsUnsignedConstant();
+  }
+  return std::nullopt; // invalid DIE, or unit DIE has no DW_AT_language
+}
+
 Expected<DWARFLocationExpressionsVector>
 DWARFDie::getLocations(dwarf::Attribute Attr) const {
   std::optional<DWARFFormValue> Location = find(Attr);
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
index 42622ea12152a..872b24e59932c 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -14,7 +14,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 772a3fa93c51e..700fb03addce2 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -13,7 +13,6 @@
 #include "llvm-c/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
index 1250c0defd314..4c7a58accacc7 100644
--- a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
+++ b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
@@ -12,7 +12,6 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
 #include <mutex>
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF.cpp
index f4701bc830d66..4503642f52a9d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFF.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFF.cpp
@@ -15,8 +15,6 @@
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/ExecutionEngine/JITLink/COFF_x86_64.h"
 #include "llvm/Object/COFF.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include <cstring>
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
index f0cf551ffae5a..8b6c88da52eb8 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
@@ -10,7 +10,6 @@
 
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Config/config.h"
-#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/Support/DynamicLibrary.h"
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
index fdcce20cd2d10..d92feeccf5e92 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
@@ -21,8 +21,6 @@
 #include "llvm/ExecutionEngine/JITLink/ELF_riscv.h"
 #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
 #include "llvm/Object/ELF.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include <cstring>
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index ba30a76934e11..9041dc3a52dcf 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -17,8 +17,6 @@
 #include "llvm/ExecutionEngine/JITLink/i386.h"
 #include "llvm/ExecutionEngine/JITLink/loongarch.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 01144763ac4ca..254c04b198612 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -12,9 +12,6 @@
 
 #include "JITLinkGeneric.h"
 
-#include "llvm/Support/BinaryStreamReader.h"
-#include "llvm/Support/MemoryBuffer.h"
-
 #define DEBUG_TYPE "jitlink"
 
 namespace llvm {
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
index 40086ccf2b66f..f1181b59234e4 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
@@ -16,8 +16,6 @@
 #include "llvm/ExecutionEngine/JITLink/MachO_arm64.h"
 #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SwapByteOrder.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
index 00be2f57d0664..d183498f30a68 100644
--- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
@@ -13,10 +13,8 @@
 #include "llvm/ExecutionEngine/JITLink/aarch32.h"
 
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/ExecutionEngine/JITLink/JITLink.h"
 #include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h"
-#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MathExtras.h"
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index e3b7db2380bb0..5f3067b2a97ea 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -19,7 +19,6 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/DynamicLibrary.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index 9296bc2b389ab..9a90af0e8d738 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -7,13 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/Layer.h"
-#include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <string>
 
 using namespace llvm;
 using namespace llvm::orc;
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
index fad7428e1f906..5d2f3cd4a8be8 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
@@ -15,13 +15,10 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include "llvm/Target/TargetMachine.h"
 
-#include <algorithm>
-
 namespace llvm {
 namespace orc {
 
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
index acbf33888adee..9859e4a4c6fd1 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
@@ -19,9 +19,7 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
-#include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/MSVCErrorWorkarounds.h"
 #include "llvm/Support/MemoryBuffer.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
index 0f6923a7633f3..de8d003408871 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
@@ -12,7 +12,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Format.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
index fffecfc978144..767115f984c1f 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.cpp
@@ -12,7 +12,6 @@
 
 #include "llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h"
 
-#include "llvm/ExecutionEngine/JITLink/x86_64.h"
 #include "llvm/ExecutionEngine/Orc/Debugging/DebugInfoSupport.h"
 #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
 #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
index 3874f25751b1a..c48c73769955a 100644
--- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
@@ -9,17 +9,12 @@
 
 #include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
 
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
 #include "llvm/ExecutionEngine/JITLink/aarch64.h"
 #include "llvm/ExecutionEngine/JITLink/ppc64.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
 #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h"
-#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
-#include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
index 1ca6e5e5413bd..9f7d517d481d4 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
@@ -9,9 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h"
 
 #include "llvm/ExecutionEngine/Orc/Core.h"
-#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
-#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
-#include "llvm/Support/BinaryStreamWriter.h"
 
 namespace llvm {
 namespace orc {
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
index b05f08fd7cdfe..50e6b254595b8 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -9,7 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h"
 
 #include "llvm/ExecutionEngine/JITLink/JITLink.h"
-#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 
 #include <limits>
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
index fbe25d70c38a2..060f17c957ef9 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h"
-#include "llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/FormatVariadic.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index efaed1b82d0eb..edb49a41ecd12 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -19,7 +19,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Object/MachOUniversal.h"
-#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Target/TargetMachine.h"
 #include <string>
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index 2a3ede90ade0d..aa799687e6d5d 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -12,7 +12,6 @@
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h"
-#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Process.h"
 #include "llvm/TargetParser/Host.h"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
index d5b11349277c1..b2255d9eef64b 100644
--- a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
-#include "llvm/Support/MemoryBuffer.h"
 
 namespace llvm {
 namespace orc {
diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 5724f96c2d568..ee9acf0ab33a4 100644
--- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -7,17 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
 #include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInstrAnalysis.h"
-#include "llvm/Support/Format.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include <sstream>
 
 #define DEBUG_TYPE "orc"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp
index 9fefa76ed7247..5a08083497d0d 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp
@@ -9,8 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h"
 #include "llvm/ExecutionEngine/Orc/Core.h"
 
-#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
-
 #define DEBUG_TYPE "orc"
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
index 8d4e79c7d8aff..6dad7124a88e2 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
-#include "llvm/ADT/StringMap.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index b31f462956549..b49c80b9287ed 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS
-#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
-#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/COFFPlatform.h"
 #include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
 #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h"
@@ -21,7 +19,6 @@
 #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/GlobalVariable.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 822316c4bf996..b4326a17de3b7 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -13,12 +13,8 @@
 #include "llvm/ExecutionEngine/JITLink/aarch64.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
 #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h"
-#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
 #include "llvm/ExecutionEngine/Orc/MachOBuilder.h"
-#include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
-#include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
diff --git a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
index 9abf7c11d5467..fdff0e6af7aab 100644
--- a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
@@ -9,7 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/Mangling.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Mangler.h"
-#include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "orc"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
index d099a251232e7..93fe7eeb3ed5b 100644
--- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
 
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ExecutionEngine/JITLink/JITLink.h"
 #include "llvm/Support/Process.h"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index 50062d69b8768..944fca000d61f 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -11,7 +11,6 @@
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/Support/WindowsError.h"
-#include <algorithm>
 
 #if defined(LLVM_ON_UNIX) && !defined(__ANDROID__)
 #include <fcntl.h>
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
index 0286b0c931974..71221c2d3fce5 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
@@ -7,15 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h"
-#include "llvm/ExecutionEngine/Orc/COFFPlatform.h"
-#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
-#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Debug.h"
 #include <optional>
 
 #define DEBUG_TYPE "orc"
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 86c08cbdee5f3..592941752805a 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -9,9 +9,7 @@
 #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
 #include "llvm/ExecutionEngine/JITLink/aarch32.h"
-#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h"
 #include "llvm/ExecutionEngine/Orc/DebugUtils.h"
-#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
 #include "llvm/Support/MemoryBuffer.h"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
index 6d568199378a0..6b138cd319c68 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "orc"
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
index 7e142ac9da07d..e4ab2c0fd03ae 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS
 #include "llvm/Support/Endian.h"
-#include "llvm/Support/FormatVariadic.h"
 
 #if !defined(_MSC_VER) && !defined(__MINGW32__)
 #include <unistd.h>
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
index 0f9612bae074c..41b97527f38d6 100644
--- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h"
 #include "llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h"
-#include "llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/Support/FormatVariadic.h"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp b/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
index 8f42de91b5bb5..3bdc5b46c529c 100644
--- a/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
@@ -18,8 +18,6 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/ErrorHandling.h"
 
-#include <algorithm>
-
 namespace {
 using namespace llvm;
 SmallVector<const BasicBlock *, 8> findBBwithCalls(const Function &F,
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
index 6347032f010be..68215017fb200 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
@@ -8,14 +8,11 @@
 
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
 
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormatVariadic.h"
 
 #include <cstdint>
 #include <mutex>
-#include <utility>
 
 #define DEBUG_TYPE "orc"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
index f40d93fefb875..ec576599760cd 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
@@ -27,7 +27,6 @@
 
 #include <sys/mman.h> // mmap()
 #include <time.h>     // clock_gettime(), time(), localtime_r() */
-#include <unistd.h>   // for read(), close()
 
 #define DEBUG_TYPE "orc"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
index 57ac991ee37f3..fb7cf94fa0654 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
@@ -11,7 +11,6 @@
 
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h"
 #include "llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h"
-#include <map>
 
 #if LLVM_USE_INTEL_JITEVENTS
 #include "IntelJITEventsWrapper.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
index fdae0e45da65f..f11a73bb5c7ac 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
@@ -9,15 +9,10 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 
 #include "llvm/Config/config.h"
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/raw_ostream.h"
 
-#include "llvm/Support/FormatVariadic.h"
-
 #define DEBUG_TYPE "orc"
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp
index b7e256a826ca4..db6f20130db9e 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp
@@ -9,7 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h"
 
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
-#include "llvm/Support/FormatVariadic.h"
 
 #define DEBUG_TYPE "orc"
 
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp
index a585767bf474e..4862b2d3e7f79 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp
@@ -9,7 +9,6 @@
 #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"
 
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
-#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Process.h"
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
index 73b37ee0ff331..69e95654666e1 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
@@ -15,7 +15,6 @@
 #include "Targets/RuntimeDyldCOFFI386.h"
 #include "Targets/RuntimeDyldCOFFThumb.h"
 #include "Targets/RuntimeDyldCOFFX86_64.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/TargetParser/Triple.h"
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index d3d3735e4ea53..ff48a938cbd42 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
 #include "RuntimeDyldCheckerImpl.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -21,9 +20,7 @@
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Endian.h"
-#include "llvm/Support/MSVCErrorWorkarounds.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
 #include <cctype>
 #include <memory>
 #include <utility>
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 5f8dc41433564..c4326200b4dd4 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -11,9 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RuntimeDyldELF.h"
-#include "RuntimeDyldCheckerImpl.h"
 #include "Targets/RuntimeDyldELFMips.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Object/ELFObjectFile.h"
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 9ca76602ea18e..d9a59733dc4cc 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -15,7 +15,6 @@
 #include "Targets/RuntimeDyldMachOARM.h"
 #include "Targets/RuntimeDyldMachOI386.h"
 #include "Targets/RuntimeDyldMachOX86_64.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 using namespace llvm;
diff --git a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
index 436888730bfb2..e3391992b8204 100644
--- a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
@@ -13,7 +13,6 @@
 
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/Config/config.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
 
 namespace llvm {
diff --git a/llvm/lib/Frontend/Atomic/Atomic.cpp b/llvm/lib/Frontend/Atomic/Atomic.cpp
index 03b476d113bd4..b54312293f9b0 100644
--- a/llvm/lib/Frontend/Atomic/Atomic.cpp
+++ b/llvm/lib/Frontend/Atomic/Atomic.cpp
@@ -8,11 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Frontend/Atomic/Atomic.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Frontend/Atomic/Atomic.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
 
 namespace {} // namespace
 
diff --git a/llvm/lib/Frontend/HLSL/HLSLResource.cpp b/llvm/lib/Frontend/HLSL/HLSLResource.cpp
index dd2b0b60bc19b..48310d4f28e67 100644
--- a/llvm/lib/Frontend/HLSL/HLSLResource.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLResource.cpp
@@ -13,7 +13,6 @@
 #include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
 
 using namespace llvm;
 using namespace llvm::hlsl;
diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp
index 7a0a7afcfcb5c..9e85ffbfe22d7 100644
--- a/llvm/lib/Frontend/Offloading/Utility.cpp
+++ b/llvm/lib/Frontend/Offloading/Utility.cpp
@@ -16,7 +16,6 @@
 #include "llvm/IR/Value.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/MemoryBufferRef.h"
-#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Frontend/OpenACC/ACC.cpp b/llvm/lib/Frontend/OpenACC/ACC.cpp
index 1e0516021bc6d..07e67fbec79dd 100644
--- a/llvm/lib/Frontend/OpenACC/ACC.cpp
+++ b/llvm/lib/Frontend/OpenACC/ACC.cpp
@@ -8,9 +8,7 @@
 
 #include "llvm/Frontend/OpenACC/ACC.h.inc"
 
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 using namespace acc;
diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp
index fdb09678d7a4c..2792dc4281015 100644
--- a/llvm/lib/Frontend/OpenMP/OMP.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMP.cpp
@@ -9,15 +9,11 @@
 #include "llvm/Frontend/OpenMP/OMP.h"
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/StringSaver.h"
 
 #include <algorithm>
 #include <cstdio>
@@ -193,6 +189,11 @@ bool isCombinedConstruct(Directive D) {
   return !getLeafConstructs(D).empty() && !isCompositeConstruct(D);
 }
 
+ArrayRef<unsigned> getOpenMPVersions() {
+  static const unsigned Versions[]{45, 50, 51, 52, 60}; // read-only table
+  return Versions;
+}
+
 std::string prettifyFunctionName(StringRef FunctionName) {
   // Internalized functions have the right name, but simply a suffix.
   if (FunctionName.ends_with(".internalized"))
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d2e4dc1c85dfd..59d34f67f8cfb 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -59,7 +59,6 @@
 
 #include <cstdint>
 #include <optional>
-#include <stack>
 
 #define DEBUG_TYPE "openmp-ir-builder"
 
diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp
index 389ff8130771c..c9cad7d142760 100644
--- a/llvm/lib/FuzzMutate/Operations.cpp
+++ b/llvm/lib/FuzzMutate/Operations.cpp
@@ -182,9 +182,9 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) {
 
       // We need values for each phi in the block. Since there isn't a good way
       // to do a variable number of input values currently, we just fill them
-      // with undef.
+      // with poison.
       for (PHINode &PHI : Block->phis())
-        PHI.addIncoming(UndefValue::get(PHI.getType()), Block);
+        PHI.addIncoming(PoisonValue::get(PHI.getType()), Block);
     }
     return nullptr;
   };
@@ -342,7 +342,7 @@ static SourcePred validShuffleVectorIndex() {
     // TODO: It's straighforward to make up reasonable values, but listing them
     // exhaustively would be insane. Come up with a couple of sensible ones.
     return std::vector<Constant *>{
-        UndefValue::get(VectorType::get(Int32Ty, FirstTy->getElementCount()))};
+        PoisonValue::get(VectorType::get(Int32Ty, FirstTy->getElementCount()))};
   };
   return {Pred, Make};
 }
diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
index b645888463b12..a684307586a61 100644
--- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
+++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
@@ -81,7 +81,7 @@ RandomIRBuilder::findOrCreateGlobalVariable(Module *M, ArrayRef<Value *> Srcs,
   auto MatchesPred = [&Srcs, &Pred](GlobalVariable *GV) {
     // Can't directly compare GV's type, as it would be a pointer to the actual
     // type.
-    return Pred.matches(Srcs, UndefValue::get(GV->getValueType()));
+    return Pred.matches(Srcs, PoisonValue::get(GV->getValueType()));
   };
   bool DidCreate = false;
   SmallVector<GlobalVariable *, 4> GlobalVars;
@@ -368,9 +368,9 @@ Instruction *RandomIRBuilder::newSink(BasicBlock &BB,
   if (!Ptr) {
     if (uniform(Rand, 0, 1)) {
       Type *Ty = V->getType();
-      Ptr = createStackMemory(BB.getParent(), Ty, UndefValue::get(Ty));
+      Ptr = createStackMemory(BB.getParent(), Ty, PoisonValue::get(Ty));
     } else {
-      Ptr = UndefValue::get(PointerType::get(V->getType(), 0));
+      Ptr = PoisonValue::get(PointerType::get(V->getType(), 0));
     }
   }
 
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 61a051821a5db..d81a292916fde 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -20,12 +20,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
@@ -191,7 +192,7 @@ CmpInst::Predicate ConstantRange::getEquivalentPredWithFlippedSignedness(
          "Only for relational integer predicates!");
 
   CmpInst::Predicate FlippedSignednessPred =
-      CmpInst::getFlippedSignednessPredicate(Pred);
+      ICmpInst::getFlippedSignednessPredicate(Pred);
 
   if (areInsensitiveToSignednessOfICmpPredicate(CR1, CR2))
     return FlippedSignednessPred;
diff --git a/llvm/lib/IR/DIExpressionOptimizer.cpp b/llvm/lib/IR/DIExpressionOptimizer.cpp
index 2bb8eac348c8e..be9e13a34235a 100644
--- a/llvm/lib/IR/DIExpressionOptimizer.cpp
+++ b/llvm/lib/IR/DIExpressionOptimizer.cpp
@@ -59,12 +59,14 @@ foldOperationIfPossible(uint64_t Const1, uint64_t Const2,
     return Const1 - Const2;
   }
   case dwarf::DW_OP_shl: {
-    if ((uint64_t)countl_zero(Const1) < Const2)
+    if (Const2 >= std::numeric_limits<uint64_t>::digits ||
+        static_cast<uint64_t>(countl_zero(Const1)) < Const2)
       return std::nullopt;
     return Const1 << Const2;
   }
   case dwarf::DW_OP_shr: {
-    if ((uint64_t)countr_zero(Const1) < Const2)
+    if (Const2 >= std::numeric_limits<uint64_t>::digits ||
+        static_cast<uint64_t>(countr_zero(Const1)) < Const2)
       return std::nullopt;
     return Const1 >> Const2;
   }
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 5b89a27126150..7350c65b35fb7 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3718,40 +3718,6 @@ CmpInst::Predicate CmpInst::getFlippedStrictnessPredicate(Predicate pred) {
   llvm_unreachable("Unknown predicate!");
 }
 
-CmpInst::Predicate CmpInst::getSignedPredicate(Predicate pred) {
-  assert(CmpInst::isUnsigned(pred) && "Call only with unsigned predicates!");
-
-  switch (pred) {
-  default:
-    llvm_unreachable("Unknown predicate!");
-  case CmpInst::ICMP_ULT:
-    return CmpInst::ICMP_SLT;
-  case CmpInst::ICMP_ULE:
-    return CmpInst::ICMP_SLE;
-  case CmpInst::ICMP_UGT:
-    return CmpInst::ICMP_SGT;
-  case CmpInst::ICMP_UGE:
-    return CmpInst::ICMP_SGE;
-  }
-}
-
-CmpInst::Predicate CmpInst::getUnsignedPredicate(Predicate pred) {
-  assert(CmpInst::isSigned(pred) && "Call only with signed predicates!");
-
-  switch (pred) {
-  default:
-    llvm_unreachable("Unknown predicate!");
-  case CmpInst::ICMP_SLT:
-    return CmpInst::ICMP_ULT;
-  case CmpInst::ICMP_SLE:
-    return CmpInst::ICMP_ULE;
-  case CmpInst::ICMP_SGT:
-    return CmpInst::ICMP_UGT;
-  case CmpInst::ICMP_SGE:
-    return CmpInst::ICMP_UGE;
-  }
-}
-
 bool CmpInst::isUnsigned(Predicate predicate) {
   switch (predicate) {
     default: return false;
@@ -3867,7 +3833,7 @@ std::optional<bool> ICmpInst::compare(const KnownBits &LHS,
   }
 }
 
-CmpInst::Predicate CmpInst::getFlippedSignednessPredicate(Predicate pred) {
+CmpInst::Predicate ICmpInst::getFlippedSignednessPredicate(Predicate pred) {
   assert(CmpInst::isRelational(pred) &&
          "Call only with non-equality predicates!");
 
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 88ede0d35fa3e..75f4751ea4f14 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -72,6 +72,22 @@ bool Type::isScalableTy() const {
   return isScalableTy(Visited);
 }
 
+bool Type::containsNonGlobalTargetExtType(
+    SmallPtrSetImpl<const Type *> &Visited) const {
+  if (const auto *ATy = dyn_cast<ArrayType>(this))
+    return ATy->getElementType()->containsNonGlobalTargetExtType(Visited);
+  if (const auto *STy = dyn_cast<StructType>(this))
+    return STy->containsNonGlobalTargetExtType(Visited);
+  if (auto *TT = dyn_cast<TargetExtType>(this))
+    return !TT->hasProperty(TargetExtType::CanBeGlobal);
+  return false;
+}
+
+bool Type::containsNonGlobalTargetExtType() const {
+  SmallPtrSet<const Type *, 4> Visited;
+  return containsNonGlobalTargetExtType(Visited);
+}
+
 const fltSemantics &Type::getFltSemantics() const {
   switch (getTypeID()) {
   case HalfTyID: return APFloat::IEEEhalf();
@@ -425,6 +441,34 @@ bool StructType::isScalableTy(SmallPtrSetImpl<const Type *> &Visited) const {
   return false;
 }
 
+bool StructType::containsNonGlobalTargetExtType(
+    SmallPtrSetImpl<const Type *> &Visited) const {
+  if ((getSubclassData() & SCDB_ContainsNonGlobalTargetExtType) != 0)
+    return true;
+
+  if ((getSubclassData() & SCDB_NotContainsNonGlobalTargetExtType) != 0)
+    return false;
+
+  if (!Visited.insert(this).second)
+    return false;
+
+  for (Type *Ty : elements()) {
+    if (Ty->containsNonGlobalTargetExtType(Visited)) {
+      const_cast<StructType *>(this)->setSubclassData(
+          getSubclassData() | SCDB_ContainsNonGlobalTargetExtType);
+      return true;
+    }
+  }
+
+  // For an opaque structure, return false but do not set the
+  // SCDB_NotContainsNonGlobalTargetExtType flag, since the structure may gain
+  // non-global target extension types when it becomes non-opaque.
+  if (!isOpaque())
+    const_cast<StructType *>(this)->setSubclassData(
+        getSubclassData() | SCDB_NotContainsNonGlobalTargetExtType);
+  return false;
+}
+
 bool StructType::containsHomogeneousScalableVectorTypes() const {
   if (getNumElements() <= 0 || !isa<ScalableVectorType>(elements().front()))
     return false;
@@ -903,7 +947,7 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
 
   // DirectX resources
   if (Name.starts_with("dx."))
-    return TargetTypeInfo(PointerType::get(C, 0));
+    return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal);
 
   // Opaque types in the AMDGPU name space.
   if (Name == "amdgcn.named.barrier") {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 254edf7990c70..21b8816081a0b 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -829,8 +829,10 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
 }
 
 void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
+  Type *GVType = GV.getValueType();
+
   if (GV.hasInitializer()) {
-    Check(GV.getInitializer()->getType() == GV.getValueType(),
+    Check(GV.getInitializer()->getType() == GVType,
           "Global variable initializer type does not match global "
           "variable type!",
           &GV);
@@ -854,7 +856,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
 
     // Don't worry about emitting an error for it not being an array,
     // visitGlobalValue will complain on appending non-array.
-    if (ArrayType *ATy = dyn_cast<ArrayType>(GV.getValueType())) {
+    if (ArrayType *ATy = dyn_cast<ArrayType>(GVType)) {
       StructType *STy = dyn_cast<StructType>(ATy->getElementType());
       PointerType *FuncPtrTy =
           PointerType::get(Context, DL.getProgramAddressSpace());
@@ -878,7 +880,6 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
     Check(GV.materialized_use_empty(),
           "invalid uses of intrinsic global variable", &GV);
 
-    Type *GVType = GV.getValueType();
     if (ArrayType *ATy = dyn_cast<ArrayType>(GVType)) {
       PointerType *PTy = dyn_cast<PointerType>(ATy->getElementType());
       Check(PTy, "wrong type for intrinsic global variable", &GV);
@@ -912,15 +913,13 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
 
   // Scalable vectors cannot be global variables, since we don't know
   // the runtime size.
-  Check(!GV.getValueType()->isScalableTy(),
-        "Globals cannot contain scalable types", &GV);
-
-  // Check if it's a target extension type that disallows being used as a
-  // global.
-  if (auto *TTy = dyn_cast<TargetExtType>(GV.getValueType()))
-    Check(TTy->hasProperty(TargetExtType::CanBeGlobal),
-          "Global @" + GV.getName() + " has illegal target extension type",
-          TTy);
+  Check(!GVType->isScalableTy(), "Globals cannot contain scalable types", &GV);
+
+  // Check if it is or contains a target extension type that disallows being
+  // used as a global.
+  Check(!GVType->containsNonGlobalTargetExtType(),
+        "Global @" + GV.getName() + " has illegal target extension type",
+        GVType);
 
   if (!GV.hasInitializer()) {
     visitGlobalValue(GV);
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 0d54c534590ca..4bb0ddf891744 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -82,8 +82,6 @@ class TypeMapTy : public ValueMapTypeRemapper {
   Type *get(Type *SrcTy);
   Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
 
-  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
-
   FunctionType *get(FunctionType *T) {
     return cast<FunctionType>(get((Type *)T));
   }
@@ -233,20 +231,6 @@ Error TypeMapTy::linkDefinedTypeBodies() {
   return Error::success();
 }
 
-void TypeMapTy::finishType(StructType *DTy, StructType *STy,
-                           ArrayRef<Type *> ETypes) {
-  DTy->setBody(ETypes, STy->isPacked());
-
-  // Steal STy's name.
-  if (STy->hasName()) {
-    SmallString<16> TmpName = STy->getName();
-    STy->setName("");
-    DTy->setName(TmpName);
-  }
-
-  DstStructTypesSet.addNonOpaque(DTy);
-}
-
 Type *TypeMapTy::get(Type *Ty) {
   SmallPtrSet<StructType *, 8> Visited;
   return get(Ty, Visited);
@@ -292,17 +276,9 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
     AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
   }
 
-  // If we found our type while recursively processing stuff, just use it.
+  // Refresh Entry after recursively processing stuff.
   Entry = &MappedTypes[Ty];
-  if (*Entry) {
-    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
-      if (DTy->isOpaque()) {
-        auto *STy = cast<StructType>(Ty);
-        finishType(DTy, STy, ElementTypes);
-      }
-    }
-    return *Entry;
-  }
+  assert(!*Entry && "Recursive type!");
 
   // If all of the element types mapped directly over and the type is not
   // a named struct, then the type is usable as-is.
@@ -350,8 +326,17 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
       return *Entry = Ty;
     }
 
-    StructType *DTy = StructType::create(Ty->getContext());
-    finishType(DTy, STy, ElementTypes);
+    StructType *DTy =
+        StructType::create(Ty->getContext(), ElementTypes, "", STy->isPacked());
+
+    // Steal STy's name.
+    if (STy->hasName()) {
+      SmallString<16> TmpName = STy->getName();
+      STy->setName("");
+      DTy->setName(TmpName);
+    }
+
+    DstStructTypesSet.addNonOpaque(DTy);
     return *Entry = DTy;
   }
   }
diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
index 92618bdabbe51..abaf0f0246183 100644
--- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
+++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
@@ -145,7 +145,7 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() {
 
   static cl::opt<bool> X86RelaxRelocations(
       "x86-relax-relocations",
-      cl::desc("Emit GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX instead of "
+      cl::desc("Emit GOTPCRELX/REX_GOTPCRELX/CODE_4_GOTPCRELX instead of "
                "GOTPCREL on x86-64 ELF"),
       cl::init(true));
   MCBINDOPT(X86RelaxRelocations);
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 9857eb0de7a80..e755e8161d36d 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
-#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 9dc39936ffd8b..76415aed1387c 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Support/ARMBuildAttributes.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/HexagonAttributeParser.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/RISCVAttributeParser.h"
 #include "llvm/Support/RISCVAttributes.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
@@ -550,6 +549,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
     return "gfx941";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:
     return "gfx942";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:
+    return "gfx950";
 
   // AMDGCN GFX10.
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010:
diff --git a/llvm/lib/Object/MachOUniversal.cpp b/llvm/lib/Object/MachOUniversal.cpp
index c2c2b67814dcd..655875f3396c1 100644
--- a/llvm/lib/Object/MachOUniversal.cpp
+++ b/llvm/lib/Object/MachOUniversal.cpp
@@ -17,7 +17,6 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SwapByteOrder.h"
-#include "llvm/Support/type_traits.h"
 
 using namespace llvm;
 using namespace object;
diff --git a/llvm/lib/Object/Minidump.cpp b/llvm/lib/Object/Minidump.cpp
index 83c527e84365f..a96e23893bfba 100644
--- a/llvm/lib/Object/Minidump.cpp
+++ b/llvm/lib/Object/Minidump.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/Minidump.h"
-#include "llvm/Object/Error.h"
 #include "llvm/Support/ConvertUTF.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp
index fb36a88b9c714..54e654a0d121c 100644
--- a/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/Object/ModuleSymbolTable.h"
 #include "RecordStreamer.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -42,7 +41,6 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
-#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <memory>
diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp
index 89dc12551494f..56687c9acb653 100644
--- a/llvm/lib/Object/OffloadBinary.cpp
+++ b/llvm/lib/Object/OffloadBinary.cpp
@@ -15,15 +15,12 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Object/Archive.h"
-#include "llvm/Object/ArchiveWriter.h"
 #include "llvm/Object/Binary.h"
-#include "llvm/Object/COFF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/Error.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Alignment.h"
-#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp
index d9bb8f175ef83..a28f5943c320d 100644
--- a/llvm/lib/Object/RelocationResolver.cpp
+++ b/llvm/lib/Object/RelocationResolver.cpp
@@ -17,7 +17,6 @@
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/ELFTypes.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/SymbolicFile.h"
 #include "llvm/Support/Casting.h"
diff --git a/llvm/lib/Object/TapiFile.cpp b/llvm/lib/Object/TapiFile.cpp
index 4eaacc48e7ce0..6b9e89e85b273 100644
--- a/llvm/lib/Object/TapiFile.cpp
+++ b/llvm/lib/Object/TapiFile.cpp
@@ -13,7 +13,6 @@
 #include "llvm/Object/TapiFile.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
-#include "llvm/Object/Error.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/InterfaceFile.h"
diff --git a/llvm/lib/Object/TapiUniversal.cpp b/llvm/lib/Object/TapiUniversal.cpp
index 4db5841a8f34c..6667a68f5c8c6 100644
--- a/llvm/lib/Object/TapiUniversal.cpp
+++ b/llvm/lib/Object/TapiUniversal.cpp
@@ -12,9 +12,7 @@
 
 #include "llvm/Object/TapiUniversal.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Object/Error.h"
 #include "llvm/Object/TapiFile.h"
-#include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/TextAPIReader.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index ef19c3c62835f..dcd728dfa2d95 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -21,16 +21,13 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
-#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <cstring>
-#include <limits>
 
 #define DEBUG_TYPE "wasm-object"
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 130b8798ab4a4..ca0ea03452d3b 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -609,6 +609,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH);
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index e507b241d9828..cdf4412c6477a 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -34,7 +33,6 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index adfd22804356e..12b1687af69db 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Support/Compression.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cassert>
 #include <limits>
 #include <vector>
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index cd2c5d72ea522..e43f3ac9f08d4 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -26,7 +26,6 @@
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include <algorithm>
 #include <cstddef>
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 47f463541d8ef..d90629ad57f5b 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -19,7 +19,6 @@
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/ProfileCommon.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compression.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
@@ -350,6 +349,36 @@ bool InstrProfWriter::addMemProfCallStack(
   return true;
 }
 
+bool InstrProfWriter::addMemProfData(memprof::IndexedMemProfData Incoming,
+                                     function_ref<void(Error)> Warn) {
+  // Merges Incoming into MemProfData; false if a frame/call stack add fails.
+  // TODO: Once we remove support for MemProf format Version V1, assert that
+  // the three components (frames, call stacks, and records) are either all
+  // empty or populated.
+  if (MemProfData.Frames.empty())
+    MemProfData.Frames = std::move(Incoming.Frames);
+  else
+    for (const auto &[Id, F] : Incoming.Frames)
+      if (!addMemProfFrame(Id, F, Warn))
+        return false;
+
+  if (MemProfData.CallStacks.empty())
+    MemProfData.CallStacks = std::move(Incoming.CallStacks);
+  else
+    for (const auto &[CSId, CS] : Incoming.CallStacks)
+      if (!addMemProfCallStack(CSId, CS, Warn))
+        return false;
+
+  // Add one record at a time if randomization is requested.
+  if (MemProfData.Records.empty() && !MemprofGenerateRandomHotness)
+    MemProfData.Records = std::move(Incoming.Records);
+  else
+    for (const auto &[GUID, Record] : Incoming.Records)
+      addMemProfRecord(GUID, Record);
+
+  return true;
+}
+
 void InstrProfWriter::addBinaryIds(ArrayRef<llvm::object::BuildID> BIs) {
   llvm::append_range(BinaryIds, BIs);
 }
@@ -606,7 +635,7 @@ writeMemProfCallStackArray(
   llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
       MemProfCallStackIndexes;
 
-  memprof::CallStackRadixTreeBuilder Builder;
+  memprof::CallStackRadixTreeBuilder<memprof::FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   for (auto I : Builder.getRadixArray())
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 6c4bda2d9264f..9d5ac748d7975 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -436,10 +436,12 @@ CallStackId hashCallStack(ArrayRef<FrameId> CS) {
 // To quickly determine the location of the common prefix within RadixArray,
 // Indexes caches the indexes of the previous call stack's frames within
 // RadixArray.
-LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
-    const llvm::SmallVector<FrameId> *CallStack,
-    const llvm::SmallVector<FrameId> *Prev,
-    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
+template <typename FrameIdTy>
+LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
+    const llvm::SmallVector<FrameIdTy> *CallStack,
+    const llvm::SmallVector<FrameIdTy> *Prev,
+    std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+        MemProfFrameIndexes) {
   // Compute the length of the common root prefix between Prev and CallStack.
   uint32_t CommonLen = 0;
   if (Prev) {
@@ -464,10 +466,11 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
 
   // Copy the part of the call stack beyond the common prefix to RadixArray.
   assert(CommonLen <= CallStack->size());
-  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
+  for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
     // Remember the index of F in RadixArray.
     Indexes.push_back(RadixArray.size());
-    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
+    RadixArray.push_back(
+        MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
   }
   assert(CallStack->size() == Indexes.size());
 
@@ -479,11 +482,13 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
   return RadixArray.size() - 1;
 }
 
-void CallStackRadixTreeBuilder::build(
-    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+template <typename FrameIdTy>
+void CallStackRadixTreeBuilder<FrameIdTy>::build(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
         &&MemProfCallStackData,
-    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
-    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
+    std::optional<const llvm::DenseMap<FrameIdTy, LinearFrameId>>
+        MemProfFrameIndexes,
+    llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
   // Take the vector portion of MemProfCallStackData.  The vector is exactly
   // what we need to sort.  Also, we no longer need its lookup capability.
   llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
@@ -535,7 +540,7 @@ void CallStackRadixTreeBuilder::build(
     // root.
     return std::lexicographical_compare(
         L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
-        [&](FrameId F1, FrameId F2) {
+        [&](FrameIdTy F1, FrameIdTy F2) {
           uint64_t H1 = FrameHistogram[F1].Count;
           uint64_t H2 = FrameHistogram[F2].Count;
           // Popular frames should come later because we encode call stacks from
@@ -585,7 +590,7 @@ void CallStackRadixTreeBuilder::build(
   // traverse CallStacks in the reverse order, then Call Stack 3 has the
   // complete call stack encoded without any pointers.  Call Stack 1 and 2 point
   // to appropriate prefixes of Call Stack 3.
-  const llvm::SmallVector<FrameId> *Prev = nullptr;
+  const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
   for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
     LinearCallStackId Pos =
         encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
@@ -608,10 +613,14 @@ void CallStackRadixTreeBuilder::build(
     V = RadixArray.size() - 1 - V;
 }
 
-llvm::DenseMap<FrameId, FrameStat>
-computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+// Explicitly instantiate class with the utilized FrameIdTy.
+template class CallStackRadixTreeBuilder<FrameId>;
+
+template <typename FrameIdTy>
+llvm::DenseMap<FrameIdTy, FrameStat>
+computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                           &MemProfCallStackData) {
-  llvm::DenseMap<FrameId, FrameStat> Histogram;
+  llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
 
   for (const auto &KV : MemProfCallStackData) {
     const auto &CS = KV.second;
@@ -624,6 +633,11 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
   return Histogram;
 }
 
+// Explicitly instantiate function with the utilized FrameIdTy.
+template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+        &MemProfCallStackData);
+
 void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
   for (const auto &AS : Record.AllocSites) {
     assert(AS.CSId == hashCallStack(AS.CallStack));
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index e3c85e179ac43..de5b4c23c58a0 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -10,7 +10,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <type_traits>
diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
index 496854e63c565..eb89d7c2f6d1d 100644
--- a/llvm/lib/ProfileData/PGOCtxProfReader.cpp
+++ b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
@@ -12,12 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/PGOCtxProfReader.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/Bitstream/BitstreamReader.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ProfileData/PGOCtxProfWriter.h"
-#include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 
 using namespace llvm;
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 8ded44fdc073b..6fc16d9effdd6 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -22,14 +22,12 @@
 #include "llvm/ProfileData/ProfileCommon.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Compression.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <memory>
diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp
index d35e3ba84990f..27ed37aa9bdd3 100644
--- a/llvm/lib/SandboxIR/Tracker.cpp
+++ b/llvm/lib/SandboxIR/Tracker.cpp
@@ -10,12 +10,75 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/StructuralHash.h"
 #include "llvm/SandboxIR/Instruction.h"
 #include <sstream>
 
 using namespace llvm::sandboxir;
 
 #ifndef NDEBUG
+
+std::string IRSnapshotChecker::dumpIR(const llvm::Function &F) const {
+  std::string Text;
+  raw_string_ostream Stream(Text);
+  F.print(Stream, /*AssemblyAnnotationWriter=*/nullptr);
+  return Text;
+}
+
+IRSnapshotChecker::ContextSnapshot IRSnapshotChecker::takeSnapshot() const {
+  // Snapshot every function: its detailed structural hash and printed IR.
+  ContextSnapshot Snapshots;
+  for (const auto &ModuleEntry : Ctx.LLVMModuleToModuleMap)
+    for (const auto &Fn : *ModuleEntry.first) {
+      FunctionSnapshot &FS = Snapshots[&Fn];
+      FS.Hash = StructuralHash(Fn, /*DetailedHash=*/true);
+      FS.TextualIR = dumpIR(Fn);
+    }
+  return Snapshots;
+}
+
+bool IRSnapshotChecker::diff(const ContextSnapshot &Orig,
+                             const ContextSnapshot &Curr) const {
+  bool Mismatch = false;
+  for (const auto &[Fn, Before] : Orig) {
+    auto AfterIt = Curr.find(Fn);
+    if (AfterIt == Curr.end()) {
+      Mismatch = true;
+      dbgs() << "Function " << Fn->getName() << " not found in current IR.\n";
+      dbgs() << Before.TextualIR << "\n";
+      continue;
+    }
+    const FunctionSnapshot &After = AfterIt->second;
+    if (Before.Hash == After.Hash)
+      continue;
+    Mismatch = true;
+    dbgs() << "Found IR difference in Function " << Fn->getName() << "\n";
+    dbgs() << "Original:\n" << Before.TextualIR << "\n";
+    dbgs() << "Current:\n" << After.TextualIR << "\n";
+  }
+  // Check that Curr doesn't contain any new functions.
+  for (const auto &[Fn, After] : Curr) {
+    if (Orig.contains(Fn))
+      continue;
+    Mismatch = true;
+    dbgs() << "Function " << Fn->getName()
+           << " found in current IR but not in original snapshot.\n";
+    dbgs() << After.TextualIR << "\n";
+  }
+  return Mismatch;
+}
+
+void IRSnapshotChecker::save() { OrigContextSnapshot = takeSnapshot(); }
+
+void IRSnapshotChecker::expectNoDiff() {
+  // Re-snapshot the IR; any deviation from the saved snapshot is a bug.
+  if (!diff(OrigContextSnapshot, takeSnapshot()))
+    return;
+  llvm_unreachable(
+      "Original and current IR differ! Probably a checkpointing bug.");
+}
+
 void UseSet::dump() const {
   dump(dbgs());
   dbgs() << "\n";
@@ -275,7 +338,12 @@ void CmpSwapOperands::dump() const {
 }
 #endif
 
-void Tracker::save() { State = TrackerState::Record; }
+void Tracker::save() {
+  State = TrackerState::Record;
+#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS)
+  SnapshotChecker.save();
+#endif
+}
 
 void Tracker::revert() {
   assert(State == TrackerState::Record && "Forgot to save()!");
@@ -283,6 +351,9 @@ void Tracker::revert() {
   for (auto &Change : reverse(Changes))
     Change->revert(*this);
   Changes.clear();
+#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS)
+  SnapshotChecker.expectNoDiff();
+#endif
 }
 
 void Tracker::accept() {
diff --git a/llvm/lib/Support/ARMBuildAttrs.cpp b/llvm/lib/Support/ARMBuildAttrs.cpp
index 6ff74e02820da..815cfc62a4b0e 100644
--- a/llvm/lib/Support/ARMBuildAttrs.cpp
+++ b/llvm/lib/Support/ARMBuildAttrs.cpp
@@ -7,10 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ARMBuildAttributes.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/LEB128.h"
-#include <iomanip>
-#include <sstream>
 
 using namespace llvm;
 
diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp
index 3fa7365e72d34..4952fe65d7767 100644
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -10,7 +10,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/SwapByteOrder.h"
 #include <string>
 #include <vector>
 
diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp
index f1b730e2b58c4..8b23b05913291 100644
--- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp
+++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cassert>
 #include <map>
 using namespace llvm;
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index 2b3ddd39b0879..eb89f6f8915cb 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -8,13 +8,10 @@
 
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/AutoConvert.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
-#include "llvm/Support/SwapByteOrder.h"
 
 #ifdef _WIN32
 #include "llvm/Support/Windows/WindowsSupport.h"
diff --git a/llvm/lib/Support/LockFileManager.cpp b/llvm/lib/Support/LockFileManager.cpp
index 4a8dd89b7619b..9a45a9966458e 100644
--- a/llvm/lib/Support/LockFileManager.cpp
+++ b/llvm/lib/Support/LockFileManager.cpp
@@ -25,7 +25,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <system_error>
-#include <thread>
 #include <tuple>
 
 #ifdef _WIN32
diff --git a/llvm/lib/Support/MSP430AttributeParser.cpp b/llvm/lib/Support/MSP430AttributeParser.cpp
index 27694b8f60f36..bfc69cb2bb3ce 100644
--- a/llvm/lib/Support/MSP430AttributeParser.cpp
+++ b/llvm/lib/Support/MSP430AttributeParser.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/Support/MSP430AttributeParser.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 using namespace llvm::MSP430Attrs;
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index aea81964ba9fd..7ea68ee4cafd7 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/SmallVectorMemoryBuffer.h"
diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp
index 3b9273e1eaadb..7a64730434193 100644
--- a/llvm/lib/Support/NativeFormatting.cpp
+++ b/llvm/lib/Support/NativeFormatting.cpp
@@ -11,7 +11,6 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
 #include <cmath>
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 4db9bc80b415b..d775285197103 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
diff --git a/llvm/lib/Support/Process.cpp b/llvm/lib/Support/Process.cpp
index 54462f23c8424..57bcc2d116dad 100644
--- a/llvm/lib/Support/Process.cpp
+++ b/llvm/lib/Support/Process.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
 
 #include <optional>
 #include <stdlib.h> // for _Exit
diff --git a/llvm/lib/Support/SuffixTreeNode.cpp b/llvm/lib/Support/SuffixTreeNode.cpp
index 9f1f94a39895e..dee8c5816b8bb 100644
--- a/llvm/lib/Support/SuffixTreeNode.cpp
+++ b/llvm/lib/Support/SuffixTreeNode.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/SuffixTreeNode.h"
-#include "llvm/Support/Casting.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp
index 7cc7ba44cc72d..693de0e6400fb 100644
--- a/llvm/lib/Support/Threading.cpp
+++ b/llvm/lib/Support/Threading.cpp
@@ -16,10 +16,8 @@
 #include "llvm/Config/llvm-config.h"
 
 #include <cassert>
-#include <errno.h>
 #include <optional>
 #include <stdlib.h>
-#include <string.h>
 
 using namespace llvm;
 
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index b3cdaa3eefc90..5febdf992fbfe 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -37,7 +37,6 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <atomic>
 #include <cassert>
 #include <cstdint>
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 56b557646100b..d259da65c5cf7 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <cstring>
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 5d30c797ebf5b..91fb4dbf16720 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -13,7 +13,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
-#include "llvm/Support/AutoConvert.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Duration.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 04b3233084a41..7a4be5759f900 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -19,7 +19,6 @@
 #include <atomic>
 #include <fcntl.h>
 #include <functional>
-#include <thread>
 
 #ifndef _WIN32
 #include <poll.h>
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac..1b1d81fcd07a2 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -322,13 +322,13 @@ def AArch64PostLegalizerCombiner
                         extractvecelt_pairwise_add, redundant_or,
                         mul_const, redundant_sext_inreg,
                         form_bitfield_extract, rotate_out_of_range,
-                        icmp_to_true_false_known_bits,
-                        select_combines, fold_merge_to_zext,
+                        icmp_to_true_false_known_bits, overflow_combines,
+                        select_combines, fold_merge_to_zext, merge_combines,
                         constant_fold_binops, identity_combines,
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
-                        commute_constant_to_rhs, merge_combines,
+                        commute_constant_to_rhs,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate]> {
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1969c830f4d31..10dad7675f4ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -920,8 +920,7 @@ bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
   if ((MulImm % std::abs(Scale)) == 0) {
     int64_t RDVLImm = MulImm / Scale;
     if ((RDVLImm >= Low) && (RDVLImm <= High)) {
-      Imm = CurDAG->getSignedConstant(RDVLImm, SDLoc(N), MVT::i32,
-                                      /*isTarget=*/true);
+      Imm = CurDAG->getSignedTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
       return true;
     }
   }
@@ -4283,7 +4282,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
     int64_t ImmVal = CNode->getSExtValue();
     SDLoc DL(N);
     if (ImmVal >= -128 && ImmVal < 128) {
-      Imm = CurDAG->getSignedConstant(ImmVal, DL, MVT::i32, /*isTarget=*/true);
+      Imm = CurDAG->getSignedTargetConstant(ImmVal, DL, MVT::i32);
       return true;
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2732e495c552a..7ab3fc06715ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3547,11 +3547,10 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
     RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                       {LHS.getValue(1), RHS});
     Chain = RHS.getValue(1);
-    VT = MVT::f32;
   }
   unsigned Opcode =
       IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
-  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
+  return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
 }
 
 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -3564,9 +3563,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
-      VT = MVT::f32;
     }
-    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+    return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
   }
 
   // The CMP instruction is just an alias for SUBS, and representing it as
@@ -5186,40 +5184,6 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
   return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
 }
 
-static EVT getExtensionTo64Bits(const EVT &OrigVT) {
-  if (OrigVT.getSizeInBits() >= 64)
-    return OrigVT;
-
-  assert(OrigVT.isSimple() && "Expecting a simple value type");
-
-  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
-  switch (OrigSimpleTy) {
-  default: llvm_unreachable("Unexpected Vector Type");
-  case MVT::v2i8:
-  case MVT::v2i16:
-     return MVT::v2i32;
-  case MVT::v4i8:
-    return  MVT::v4i16;
-  }
-}
-
-static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
-                                                 const EVT &OrigTy,
-                                                 const EVT &ExtTy,
-                                                 unsigned ExtOpcode) {
-  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
-  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
-  // 64-bits we need to insert a new extension so that it will be 64-bits.
-  assert(ExtTy.is128BitVector() && "Unexpected extension size");
-  if (OrigTy.getSizeInBits() >= 64)
-    return N;
-
-  // Must extend size to at least 64 bits to be used as an operand for VMULL.
-  EVT NewVT = getExtensionTo64Bits(OrigTy);
-
-  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
-}
-
 // Returns lane if Op extracts from a two-element vector and lane is constant
 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
 static std::optional<uint64_t>
@@ -5265,31 +5229,11 @@ static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
   EVT VT = N.getValueType();
   assert(VT.is128BitVector() && "Unexpected vector MULL size");
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned OrigEltSize = VT.getScalarSizeInBits();
-  unsigned EltSize = OrigEltSize / 2;
-  MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
-
-  APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
-  if (DAG.MaskedValueIsZero(N, HiBits))
-    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
-
-  if (ISD::isExtOpcode(N.getOpcode()))
-    return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
-                                             N.getOperand(0).getValueType(), VT,
-                                             N.getOpcode());
-
-  assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
-  SDLoc dl(N);
-  SmallVector<SDValue, 8> Ops;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    const APInt &CInt = N.getConstantOperandAPInt(i);
-    // Element types smaller than 32 bits are not legal, so use i32 elements.
-    // The values are implicitly truncated so sext vs. zext doesn't matter.
-    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
-  }
-  return DAG.getBuildVector(TruncVT, dl, Ops);
+  EVT HalfVT = EVT::getVectorVT(
+      *DAG.getContext(),
+      VT.getScalarType().getHalfSizedIntegerVT(*DAG.getContext()),
+      VT.getVectorElementCount());
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
 }
 
 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
@@ -5465,33 +5409,26 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
   if (IsN0ZExt && IsN1ZExt)
     return AArch64ISD::UMULL;
 
-  // Select SMULL if we can replace zext with sext.
-  if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
-      !isExtendedBUILD_VECTOR(N0, DAG, false) &&
-      !isExtendedBUILD_VECTOR(N1, DAG, false)) {
-    SDValue ZextOperand;
-    if (IsN0ZExt)
-      ZextOperand = N0.getOperand(0);
-    else
-      ZextOperand = N1.getOperand(0);
-    if (DAG.SignBitIsZero(ZextOperand)) {
-      SDValue NewSext =
-          DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
-      if (IsN0ZExt)
-        N0 = NewSext;
-      else
-        N1 = NewSext;
-      return AArch64ISD::SMULL;
-    }
-  }
-
   // Select UMULL if we can replace the other operand with an extend.
+  EVT VT = N0.getValueType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
   if (IsN0ZExt || IsN1ZExt) {
-    EVT VT = N0.getValueType();
-    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
-                                       VT.getScalarSizeInBits() / 2);
     if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
       return AArch64ISD::UMULL;
+  } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
+             DAG.MaskedValueIsZero(N1, Mask)) {
+    // For v2i64 we look more aggressively at both operands being zero, to avoid
+    // scalarization.
+    return AArch64ISD::UMULL;
+  }
+
+  if (IsN0SExt || IsN1SExt) {
+    if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
+      return AArch64ISD::SMULL;
+  } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
+             DAG.ComputeNumSignBits(N1) > EltSize / 2) {
+    return AArch64ISD::SMULL;
   }
 
   if (!IsN1SExt && !IsN1ZExt)
@@ -9299,8 +9236,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     // Each tail call may have to adjust the stack by a different amount, so
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
-    Ops.push_back(
-        DAG.getSignedConstant(FPDiff, DL, MVT::i32, /*isTarget=*/true));
+    Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
   }
 
   if (CLI.PAI) {
@@ -13722,11 +13658,11 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSize = VT.getScalarSizeInBits();
   if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
-    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
   if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
-    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
   if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
-    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
 
   if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
       ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
@@ -15743,7 +15679,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
     if (IsZero)
       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
     if (IsMinusOne)
-      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
+      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
   case AArch64CC::LE:
     if (IsZero)
@@ -21630,7 +21566,7 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
   // Set condition code (CC) flags.
   SDValue Test = DAG.getNode(
       Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
-      DL, MVT::Other, Pg, Op);
+      DL, MVT::i32, Pg, Op);
 
   // Convert CC to integer based on requested condition.
   // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
@@ -26436,8 +26372,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                   : AArch64SysReg::RNDRRS);
       SDLoc DL(N);
       SDValue A = DAG.getNode(
-          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
-          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
+          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
+          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
       SDValue B = DAG.getNode(
           AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
           DAG.getConstant(0, DL, MVT::i32),
@@ -28227,7 +28163,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
   if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
     assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
            "Incorrect mask type");
-    Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
   }
   Mask = convertFixedMaskToScalableVector(Mask, DAG);
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 15d4e93b915c1..242aea5fbb014 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5911,34 +5911,34 @@ multiclass FPComparison<bit signalAllNans, string asm,
                         SDPatternOperator OpNode = null_frag> {
   let Defs = [NZCV] in {
   def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
-      [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm))]> {
+      [(set NZCV, (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
     let Inst{23-22} = 0b11;
     let Predicates = [HasFullFP16];
   }
 
   def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
-      [(OpNode (f16 FPR16:$Rn), fpimm0)]> {
+      [(set NZCV, (OpNode (f16 FPR16:$Rn), fpimm0))]> {
     let Inst{23-22} = 0b11;
     let Predicates = [HasFullFP16];
   }
 
   def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
-      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm))]> {
+      [(set NZCV, (OpNode FPR32:$Rn, (f32 FPR32:$Rm)))]> {
     let Inst{23-22} = 0b00;
   }
 
   def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
-      [(OpNode (f32 FPR32:$Rn), fpimm0)]> {
+      [(set NZCV, (OpNode (f32 FPR32:$Rn), fpimm0))]> {
     let Inst{23-22} = 0b00;
   }
 
   def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
-      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm))]> {
+      [(set NZCV, (OpNode FPR64:$Rn, (f64 FPR64:$Rm)))]> {
     let Inst{23-22} = 0b01;
   }
 
   def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
-      [(OpNode (f64 FPR64:$Rn), fpimm0)]> {
+      [(set NZCV, (OpNode (f64 FPR64:$Rn), fpimm0))]> {
     let Inst{23-22} = 0b01;
   }
   } // Defs = [NZCV]
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c8d4291c5f280..b4b3eccf82427 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -438,9 +438,9 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
                                       SDTCisInt<3>,
                                       SDTCisInt<4>,
                                       SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp   : SDTypeProfile<0, 2,
-                                   [SDTCisFP<0>,
-                                    SDTCisSameAs<0, 1>]>;
+def SDT_AArch64FCmp  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                            SDTCisFP<1>,
+                                            SDTCisSameAs<2, 1>]>;
 def SDT_AArch64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
 def SDT_AArch64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
 def SDT_AArch64Insr  : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
@@ -881,8 +881,7 @@ def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
 
 def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
                                     SDT_AArch64TLSDescCallSeq,
-                                    [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
-                                     SDNPVariadic]>;
+                                    [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
 
 
 def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
@@ -992,8 +991,10 @@ def AArch64probedalloca
              [SDNPHasChain, SDNPMayStore]>;
 
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
-                        SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
-                        [SDNPHasChain, SDNPOutGlue]>;
+                        SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<2, i32>]>,
+                        [SDNPHasChain]>;
 
 def SD_AArch64rshrnb : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<2>]>;
 def AArch64rshrnb : SDNode<"AArch64ISD::RSHRNB_I", SD_AArch64rshrnb>;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index a6ba6ddc30b27..37ac915d1d880 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -876,7 +876,7 @@ defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100
 defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>;
 defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x2>;
 defm FMLA_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x4>;
-defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>;
 defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x4>;
 
 defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x2>;
@@ -884,7 +884,7 @@ defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ
 defm FMLS_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x2>;
 defm FMLS_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x4>;
 defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x2>;
-defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>;
+defm FMLS_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>;
 
 defm FCVT_2ZZ_H  : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
 defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 8791ce6266c86..564fb33758ad5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -376,7 +376,11 @@ def AArch64fadda_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
      (AArch64fadda_p_node (SVEAllActive), node:$op2,
              (vselect node:$op1, node:$op3, (splat_vector (f64 fpimm_minus0))))]>;
 
-def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64PTest : SDTypeProfile<1, 2, [
+  SDTCisVT<0, i32>,  // out flags
+  SDTCisVec<1>,      // governing predicate
+  SDTCisSameAs<2, 1> // source predicate
+]>;
 def AArch64ptest     : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
 def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>;
 
@@ -971,7 +975,7 @@ let Predicates = [HasSVEorSME] in {
   def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
 } // End HasSVEorSME
 
-let Predicates = [HasSVE] in {
+let Predicates = [HasNonStreamingSVEorSME2p2] in {
   defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
 } // End HasSVE
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2a9a7533f8625..e37e2cacc7852 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     MaxBytesForLoopAlignment = 16;
     break;
   case NeoverseV2:
-    // Specialize cost for Neoverse-V2.
+  case NeoverseV3:
+    EpilogueVectorizationMinVF = 8;
+    MaxInterleaveFactor = 4;
     ScatterOverhead = 13;
     LLVM_FALLTHROUGH;
   case NeoverseN2:
   case NeoverseN3:
-  case NeoverseV3:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 91fef0e9a1ae9..d860c29e2291a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool ATTRIBUTE = DEFAULT;
 #include "AArch64GenSubtargetInfo.inc"
 
+  unsigned EpilogueVectorizationMinVF = 16;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
@@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
            hasFuseAdrpAdd() || hasFuseLiterals();
   }
 
+  unsigned getEpilogueVectorizationMinVF() const {
+    return EpilogueVectorizationMinVF;
+  }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const;
   unsigned getCacheLineSize() const override { return CacheLineSize; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92..ec7bb71fd111f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
   return false;
 }
 
+unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
+  return ST->getEpilogueVectorizationMinVF();
+}
+
 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
   if (!ST->hasSVE())
     return false;
@@ -5239,6 +5243,22 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     // Is it profitable to sink if we found two of the same type of extends.
     return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
+  case Instruction::FMul: {
+    // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
+    if (I->getType()->isScalableTy())
+      return false;
+
+    if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
+        !ST->hasFullFP16())
+      return false;
+
+    // Sink splats for index lane variants
+    if (isSplatShuffle(I->getOperand(0)))
+      Ops.push_back(&I->getOperandUse(0));
+    if (isSplatShuffle(I->getOperand(1)))
+      Ops.push_back(&I->getOperandUse(1));
+    return !Ops.empty();
+  }
   default:
     return false;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a01d061c4c407..201bc831b816b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->useFixedOverScalableIfEqualCost();
   }
 
+  unsigned getEpilogueVectorizationMinVF() const;
+
   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
 
   bool supportsScalableVectors() const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index baa42302756a5..c8f01068f7218 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -103,7 +103,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v8s8, v16s8)
       .clampNumElements(0, v4s16, v8s16)
       .clampNumElements(0, v2s32, v4s32)
-      .clampNumElements(0, v2s64, v2s64);
+      .clampMaxNumElements(0, s64, 2)
+      .clampMaxNumElements(0, p0, 2);
 
   getActionDefinitionsBuilder(G_PHI)
       .legalFor({p0, s16, s32, s64})
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 60705e2b6d4e7..1ddb913f013f5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -836,7 +836,7 @@ class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op>
 : I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
   asm, "\t$Pg, $Pn",
   "",
-  [(op (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
+  [(set NZCV, (op (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>, Sched<[]> {
   bits<4> Pg;
   bits<4> Pn;
   let Inst{31-24} = 0b00100101;
@@ -860,7 +860,7 @@ multiclass sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op,
 
   let hasNoSchedulingInfo = 1, isCompare = 1, Defs = [NZCV] in {
   def _ANY : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
-                    [(op_any (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn))]>,
+                    [(set NZCV, (op_any (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>,
              PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>;
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index c8ae010414dc4..d3543015d667f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -137,6 +137,18 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
   "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
 >;
 
+def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32",
+  "HasMinimum3Maximum3F32",
+  "true",
+  "Has v_minimum3_f32 and v_maximum3_f32 instructions"
+>;
+
+def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
+  "HasMinimum3Maximum3F16",
+  "true",
+  "Has v_minimum3_f16 and v_maximum3_f16 instructions"
+>;
+
 def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
   "SupportsXNACK",
   "true",
@@ -360,6 +372,12 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
   "Additional instructions for GFX940+"
 >;
 
+def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
+  "GFX950Insts",
+  "true",
+  "Additional instructions for GFX950+"
+>;
+
 def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
   "GFX10Insts",
   "true",
@@ -420,6 +438,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
   "Use true 16-bit registers"
 >;
 
+def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
+  "HasBF16ConversionInsts",
+  "true",
+  "Has bf16 conversion instructions"
+>;
+
 def FeatureVOP3P : SubtargetFeature<"vop3p",
   "HasVOP3PInsts",
   "true",
@@ -960,6 +984,12 @@ def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
   "VMEM instructions of the same type write VGPR results in order"
 >;
 
+def FeaturePrngInst : SubtargetFeature<"prng-inst",
+  "HasPrngInst",
+  "true",
+  "Has v_prng_b32 instruction"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -1174,7 +1204,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
 
 def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
   "gfx9",
-  [FeatureFP64, FeatureAddressableLocalMemorySize65536,
+  [FeatureFP64,
    FeatureWavefrontSize64, FeatureFlatAddressSpace,
    FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
    FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
@@ -1257,6 +1287,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
    FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+   FeatureMinimum3Maximum3F32, FeatureMinimum3Maximum3F16,
    FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   ]
 >;
@@ -1339,6 +1370,7 @@ def FeatureISAVersion8_1_0 : FeatureSet<
 
 def FeatureISAVersion9_0_Common : FeatureSet<
   [FeatureGFX9,
+   FeatureAddressableLocalMemorySize65536,
    FeatureLDSBankCount32,
    FeatureImageInsts,
    FeatureMadMacF32Insts]>;
@@ -1356,7 +1388,8 @@ def FeatureISAVersion9_Generic : FeatureSet<
 
 def FeatureISAVersion9_0_MI_Common : FeatureSet<
   !listconcat(FeatureISAVersion9_0_Common.Features,
-    [FeatureFmaMixInsts,
+    [FeatureAddressableLocalMemorySize65536,
+     FeatureFmaMixInsts,
      FeatureDLInsts,
      FeatureDot1Insts,
      FeatureDot2Insts,
@@ -1470,9 +1503,21 @@ def FeatureISAVersion9_4_Common : FeatureSet<
    FeatureFlatBufferGlobalAtomicFaddF64Inst
    ]>;
 
+def FeatureISAVersion9_5_Common : FeatureSet<
+  !listconcat(FeatureISAVersion9_4_Common.Features,
+  [FeatureAddressableLocalMemorySize163840,
+   FeatureFP8Insts,
+   FeatureFP8ConversionInsts,
+   FeatureCvtFP8VOP1Bug,
+   FeatureGFX950Insts,
+   FeaturePrngInst,
+   FeatureBF16ConversionInsts
+   ])>;
+
 def FeatureISAVersion9_4_0 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureForceStoreSC0SC1,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
@@ -1483,6 +1528,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
 def FeatureISAVersion9_4_1 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureForceStoreSC0SC1,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
@@ -1493,6 +1539,7 @@ def FeatureISAVersion9_4_1 : FeatureSet<
 def FeatureISAVersion9_4_2 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
     [
+      FeatureAddressableLocalMemorySize65536,
       FeatureFP8Insts,
       FeatureFP8ConversionInsts,
       FeatureCvtFP8VOP1Bug,
@@ -1501,7 +1548,10 @@ def FeatureISAVersion9_4_2 : FeatureSet<
 
 def FeatureISAVersion9_4_Generic : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-    [FeatureRequiresCOV6])>;
+    [FeatureAddressableLocalMemorySize65536,
+     FeatureRequiresCOV6])>;
+
+def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>;
 
 def FeatureISAVersion10_Common : FeatureSet<
   [FeatureGFX10,
@@ -1933,6 +1983,10 @@ def isNotGFX940Plus :
   Predicate<"!Subtarget->hasGFX940Insts()">,
   AssemblerPredicate<(all_of (not FeatureGFX940Insts))>;
 
+def HasGFX950Insts :
+  Predicate<"Subtarget->hasGFX950Insts()">,
+  AssemblerPredicate<(all_of FeatureGFX950Insts)>;
+
 def isGFX8GFX9NotGFX940 :
   Predicate<"!Subtarget->hasGFX940Insts() &&"
             "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
@@ -1989,6 +2043,15 @@ def isGFX12Plus :
   Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
   AssemblerPredicate<(all_of FeatureGFX12Insts)>;
 
+def HasMinimum3Maximum3F32 :
+  Predicate<"Subtarget->hasMinimum3Maximum3F32()">,
+  AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>;
+
+def HasMinimum3Maximum3F16 :
+  Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
+  AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
+
+
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
   AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
 
@@ -2092,6 +2155,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
   // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
   // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
 
+def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
+  AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
+
 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<(all_of FeatureVOP3P)>;
 
@@ -2103,8 +2169,10 @@ def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()
 
 def HasFminFmaxLegacy : Predicate<"Subtarget->hasFminFmaxLegacy()">;
 
-def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
-  AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>;
+def HasSDWA : Predicate<"Subtarget->hasSDWA()">;
+
+def HasSDWA8 : Predicate<"Subtarget->hasSDWA()">,
+  AssemblerPredicate<(all_of (not FeatureGFX9Insts), FeatureSDWA)>;
 
 def HasSDWA9 :
   Predicate<"Subtarget->hasSDWA()">,
@@ -2303,6 +2371,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
 def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
   AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
 
+def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
+  AssemblerPredicate<(all_of FeaturePrngInst)>;
+
 def HasGDS : Predicate<"Subtarget->hasGDS()">;
 
 def HasGWS : Predicate<"Subtarget->hasGWS()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d801f2b159127..90c341ac0819c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
-  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-    // LDS is allocated in 64 dword blocks.
-    LDSAlignShift = 8;
-  } else {
+  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+    // LDS is allocated in 320 dword blocks.
+    LDSAlignShift = 11;
+  } else if (STM.getFeatureBits().test(
+                 FeatureAddressableLocalMemorySize65536)) {
     // LDS is allocated in 128 dword blocks.
     LDSAlignShift = 9;
+  } else {
+    // LDS is allocated in 64 dword blocks.
+    LDSAlignShift = 8;
   }
 
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
@@ -1590,9 +1594,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
   if (UserSGPRInfo.hasPrivateSegmentSize())
     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
 
-  if (UserSGPRInfo.hasDispatchPtr())
-    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
-
   if (STM.isXNACKEnabled())
     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index f832a2a55d622..74d1faeb6f545 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
 
 def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
 def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
+def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
 
 class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
   "wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a..28d215e7b3de9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1253,6 +1253,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     break;
   }
+  case Intrinsic::amdgcn_prng_b32: {
+    auto *Src = II.getArgOperand(0);
+    if (isa<UndefValue>(Src)) {
+      return IC.replaceInstUsesWith(II, Src);
+    }
+  }
   }
   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab..3522ece24f1c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3240,6 +3240,24 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
                     : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
     break;
+  case 12:
+    if (!Subtarget->hasLDSLoadB96_B128())
+      return false;
+
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+    break;
+  case 16:
+    if (!Subtarget->hasLDSLoadB96_B128())
+      return false;
+
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+    break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
@@ -3329,6 +3347,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   case 4:
     Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
     break;
+  case 12:
+    if (!Subtarget->hasLDSLoadB96_B128())
+      return false;
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+    break;
+  case 16:
+    if (!Subtarget->hasLDSLoadB96_B128())
+      return false;
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+    break;
   }
 
   MachineBasicBlock *MBB = MI.getParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 415c068367074..b648b68f3bd2b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4515,6 +4515,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_cvt_pk_u8_f32:
     case Intrinsic::amdgcn_alignbyte:
     case Intrinsic::amdgcn_perm:
+    case Intrinsic::amdgcn_prng_b32:
     case Intrinsic::amdgcn_fdot2:
     case Intrinsic::amdgcn_sdot2:
     case Intrinsic::amdgcn_udot2:
@@ -4746,7 +4747,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 60fa2adc62dc8..2ea254e64b8cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -333,6 +333,8 @@ foreach intr = AMDGPUMFMAIntrinsics90A in
 def : SourceOfDivergence<intr>;
 foreach intr = AMDGPUMFMAIntrinsics940 in
 def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics950 in
+def : SourceOfDivergence<intr>;
 foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
 def : SourceOfDivergence<intr>;
 foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 334322f533e54..ece26a4adb375 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -51,6 +51,7 @@ class AMDGPUSubtarget {
   bool Has16BitInsts = false;
   bool HasTrue16BitInsts = false;
   bool EnableRealTrue16Insts = false;
+  bool HasBF16ConversionInsts = false;
   bool HasMadMixInsts = false;
   bool HasMadMacF32Insts = false;
   bool HasDsSrc2Insts = false;
@@ -166,6 +167,10 @@ class AMDGPUSubtarget {
   // supported and the support for fake True16 instructions is removed.
   bool useRealTrue16Insts() const;
 
+  bool hasBF16ConversionInsts() const {
+    return HasBF16ConversionInsts;
+  }
+
   bool hasMadMixInsts() const {
     return HasMadMixInsts;
   }
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 79d6a825f60b0..e5978aee2b39a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -573,9 +573,17 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
   }
 }
 
-multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> {
   defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
-  defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
+
+  if !ne(LDSPred, TruePredicate) then {
+    let SubtargetPredicate = LDSPred in {
+      defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
+    }
+  } else {
+    defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
+  }
+
 }
 
 multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName,
@@ -732,7 +740,6 @@ class MUBUF_Atomic_Pseudo<string opName,
     MUBUF_SetupAddr<addrKindCopy> {
   let mayStore = 1;
   let mayLoad = 1;
-  let hasPostISelHook = 1;
   let hasSideEffects = 1;
   let DisableWQM = 1;
   let has_glc = 0;
@@ -956,11 +963,11 @@ defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
 defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
   "buffer_load_dwordx2", v2i32
 >;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx3", v3i32
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads_Lds <
+  "buffer_load_dwordx3", v3i32, /*LDSPred=*/HasGFX950Insts
 >;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
-  "buffer_load_dwordx4", v4i32
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads_Lds <
+  "buffer_load_dwordx4", v4i32, /*LDSPred=*/HasGFX950Insts
 >;
 
 defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc <
@@ -3231,8 +3238,8 @@ defm BUFFER_LOAD_USHORT         : MUBUF_Real_AllAddr_Lds_vi <0x12>;
 defm BUFFER_LOAD_SSHORT         : MUBUF_Real_AllAddr_Lds_vi <0x13>;
 defm BUFFER_LOAD_DWORD          : MUBUF_Real_AllAddr_Lds_vi <0x14>;
 defm BUFFER_LOAD_DWORDX2        : MUBUF_Real_AllAddr_vi <0x15>;
-defm BUFFER_LOAD_DWORDX3        : MUBUF_Real_AllAddr_vi <0x16>;
-defm BUFFER_LOAD_DWORDX4        : MUBUF_Real_AllAddr_vi <0x17>;
+defm BUFFER_LOAD_DWORDX3        : MUBUF_Real_AllAddr_Lds_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4        : MUBUF_Real_AllAddr_Lds_vi <0x17>;
 defm BUFFER_STORE_BYTE          : MUBUF_Real_AllAddr_vi <0x18>;
 defm BUFFER_STORE_BYTE_D16_HI   : MUBUF_Real_AllAddr_vi <0x19>;
 defm BUFFER_STORE_SHORT         : MUBUF_Real_AllAddr_vi <0x1a>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 7724821bbd7c3..061ffda2498f4 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -194,7 +194,6 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
   (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds),
   " $vdst, $addr, $data0$offset$gds"> {
 
-  let hasPostISelHook = 1;
   let has_data1 = 0;
   let IsAtomicRet = 1;
 }
@@ -223,7 +222,6 @@ class DS_1A2D_RET<string opName,
   (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset:$offset, gds:$gds),
   " $vdst, $addr, $data0, $data1$offset$gds"> {
 
-  let hasPostISelHook = 1;
   let IsAtomicRet = 1;
 }
 
@@ -248,7 +246,6 @@ class DS_1A2D_Off8_RET<string opName,
   " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
 
   let has_offset = 0;
-  let hasPostISelHook = 1;
 }
 
 multiclass DS_1A2D_Off8_RET_mc<string opName,
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 7c293c1a5e512..06df08feda8fa 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -345,6 +345,25 @@ static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
                               (AMDGPU::OperandSemantics)OperandSemantics));
 }
 
+template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
+          unsigned OperandSemantics>
+static DecodeStatus // Decodes a 9-bit source field into a 16-bit VGPR operand or a non-VGPR source.
+decodeOperand_VSrcT16_Lo128_Deferred(MCInst &Inst, unsigned Imm,
+                                     uint64_t /*Addr*/,
+                                     const MCDisassembler *Decoder) {
+  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  assert(isUInt<9>(Imm) && "9-bit encoding expected");
+
+  if (Imm & AMDGPU::EncValues::IS_VGPR) { // VGPR form: build a 16-bit VGPR operand.
+    bool IsHi = Imm & (1 << 7);   // Bit 7 is passed on as the IsHi selector.
+    unsigned RegIdx = Imm & 0x7f; // Low 7 bits are the register index.
+    return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
+  }
+  return addOperand(Inst, DAsm->decodeNonVGPRSrcOp( // Otherwise decode the low 8 bits as a non-VGPR source.
+                              OpWidth, Imm & 0xFF, true, ImmWidth,
+                              (AMDGPU::OperandSemantics)OperandSemantics));
+}
+
 template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
           unsigned OperandSemantics>
 static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db45..72b13fd5f3695 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -516,7 +516,6 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
 class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
                             string asm, list<dag> pattern = []>
   : FLAT_AtomicNoRet_Pseudo<opName, outs, ins, asm, pattern> {
-  let hasPostISelHook = 1;
   let has_vdst = 1;
   let glcValue = 1;
   let sccbValue = 0;
@@ -934,6 +933,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
 
+let SubtargetPredicate = HasGFX950Insts in {
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
+} // End SubtargetPredicate = HasGFX950Insts
+
 let SubtargetPredicate = isGFX12Plus in {
   defm GLOBAL_ATOMIC_COND_SUB_U32    : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
   defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1984,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
 
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
+
+
 defm GLOBAL_ATOMIC_SWAP       : FLAT_Global_Real_Atomics_vi <0x40>;
 defm GLOBAL_ATOMIC_CMPSWAP    : FLAT_Global_Real_Atomics_vi <0x41>;
 defm GLOBAL_ATOMIC_ADD        : FLAT_Global_Real_Atomics_vi <0x42>;
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 067043d290b76..3403cbab526d4 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -204,6 +204,10 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
   FeatureISAVersion9_4_2.Features
 >;
 
+def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel,
+  FeatureISAVersion9_5_0.Features
+>;
+
 // [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c]
 def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
   FeatureISAVersion9_Generic.Features
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 8a64dc056d7a6..6233ca2eb4f1d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -54,7 +54,7 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
 static cl::opt<unsigned>
     NSAThreshold("amdgpu-nsa-threshold",
                  cl::desc("Number of addresses from which to enable MIMG NSA."),
-                 cl::init(3), cl::Hidden);
+                 cl::init(2), cl::Hidden);
 
 GCNSubtarget::~GCNSubtarget() = default;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6ff964077d8fd..f3f96940c1f44 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -106,6 +106,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool GFX9Insts = false;
   bool GFX90AInsts = false;
   bool GFX940Insts = false;
+  bool GFX950Insts = false;
   bool GFX10Insts = false;
   bool GFX11Insts = false;
   bool GFX12Insts = false;
@@ -219,7 +220,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasSALUFloatInsts = false;
   bool HasPseudoScalarTrans = false;
   bool HasRestrictedSOffset = false;
-
+  bool HasPrngInst = false;
   bool HasVcmpxPermlaneHazard = false;
   bool HasVMEMtoScalarWriteHazard = false;
   bool HasSMEMtoVectorWriteHazard = false;
@@ -241,7 +242,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasForceStoreSC0SC1 = false;
   bool HasRequiredExportPriority = false;
   bool HasVmemWriteVgprInOrder = false;
-
+  bool HasMinimum3Maximum3F32 = false;
+  bool HasMinimum3Maximum3F16 = false;
   bool RequiresCOV6 = false;
 
   // Dummy feature to use for assembler in tablegen.
@@ -1283,6 +1285,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }
 
+  // GFX950 is a derivative of GFX940. hasGFX950Insts() implies that
+  // hasGFX940Insts and hasGFX90AInsts are also true.
+  bool hasGFX950Insts() const { return GFX950Insts; }
+
+  /// Returns true if the target supports
+  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+  bool hasLDSLoadB96_B128() const {
+    return hasGFX950Insts();
+  }
+
   bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 
   bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
@@ -1306,11 +1319,21 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }
 
+  bool hasMinimum3Maximum3F32() const {
+    return HasMinimum3Maximum3F32;
+  }
+
+  bool hasMinimum3Maximum3F16() const {
+    return HasMinimum3Maximum3F16;
+  }
+
   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
   /// instruction.
   unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
 
+  bool hasPrngInst() const { return HasPrngInst; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 55ba5ebbebb8f..ffde4d33f1341 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -96,6 +96,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940:  AK = GK_GFX940;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941:  AK = GK_GFX941;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:  AK = GK_GFX942;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:  AK = GK_GFX950;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
@@ -182,6 +183,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX940:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
   case GK_GFX941:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941;
   case GK_GFX942:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942;
+  case GK_GFX950:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950;
   case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
   case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
   case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 13a2db7a87b43..dcd4f0f65e8ef 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1299,7 +1299,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
       MIB.setMIFlag(MachineInstr::FrameDestroy);
   } else {
     // Insert the CSR spill restores with SP as the base register.
-    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
+    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
+                         FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                          FramePtrRegScratchCopy);
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1e261f4256c93..5b02f9bf80d3f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -889,6 +889,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::MUL, MVT::i1, Promote);
 
+  if (Subtarget->hasBF16ConversionInsts()) {
+    setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
+    setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
+  }
+
   setTargetDAGCombine({ISD::ADD,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
@@ -9819,6 +9825,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
       break;
+    case 12:
+      if (!Subtarget->hasLDSLoadB96_B128())
+        return SDValue();
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+      break;
+    case 16:
+      if (!Subtarget->hasLDSLoadB96_B128())
+        return SDValue();
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+      break;
     }
 
     SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
@@ -9888,6 +9910,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     case 4:
       Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
       break;
+    case 12:
+      if (!Subtarget->hasLDSLoadB96_B128())
+        return SDValue();
+      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
+      break;
+    case 16:
+      if (!Subtarget->hasLDSLoadB96_B128())
+        return SDValue();
+      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
+      break;
     }
 
     auto *M = cast<MemSDNode>(Op);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9f8e6a082d965..d2024cf915874 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1991,13 +1991,14 @@ class getInsVOP3Base<RegisterOperand Src0RC, RegisterOperand Src1RC,
 
 class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
                    RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp, bit HasOpSel,
+                   bit HasNeg,
                    Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
   dag base = getInsVOP3Base<Src0RC, Src1RC, Src2RC, NumSrcArgs,
                     HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/,
                     0/*HasOMod*/, Src0Mod, Src1Mod, Src2Mod, HasOpSel>.ret;
 
   dag vop3pOpsel = (ins op_sel_hi0:$op_sel_hi);
-  dag vop3p_neg = (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+  dag vop3p_neg = !if(HasNeg, (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), (ins));
 
   dag vop3pFields = !con(!if(HasOpSel, vop3pOpsel, (ins)), vop3p_neg);
   dag ret = !con(base, vop3pFields);
@@ -2191,22 +2192,22 @@ class getAsmVOPDPart <int NumSrcArgs, string XorY> {
 
 // Returns the assembly string for the inputs and outputs of a VOP3P
 // instruction.
-class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
+class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasNeg,
                    bit HasClamp, bit HasOpSel> {
-  string dst = "$vdst";
-  string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+  string dst = !if(HasDst, "$vdst"# !if(!gt(NumSrcArgs, 0), ",", ""), ""); // "$vdst", with a comma only when sources follow; empty without a dst.
+  string src0 = !if(!eq(NumSrcArgs, 1), " $src0", " $src0,"); // Each source supplies its own leading space. NOTE(review): NumSrcArgs == 0 still emits " $src0," - confirm no profile hits this.
   string src1 = !if(!eq(NumSrcArgs, 1), "",
                     !if(!eq(NumSrcArgs, 2), " $src1",
                                             " $src1,"));
   string src2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
 
-  string mods = !if(HasModifiers, "$neg_lo$neg_hi", "");
+  string mods = !if(HasNeg, "$neg_lo$neg_hi", ""); // neg_lo/neg_hi are printed only when HasNeg is set.
   string clamp = !if(HasClamp, "$clamp", "");
   string opsel = !if(HasOpSel, "$op_sel$op_sel_hi", "");
 
   // Each modifier is printed as an array of bits for each operand, so
   // all operands are printed as part of src0_modifiers.
-  string ret = dst#", "#src0#src1#src2#opsel#mods#clamp;
+  string ret = dst#src0#src1#src2#opsel#mods#clamp; // Separators come from dst/src0 themselves, so nothing is inserted here.
 }
 
 // FIXME-TRUE16 AsmVOP3OpSel will be deprecated after all
@@ -2267,7 +2268,7 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
 
 class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
                        bit HasOpSel, bit HasOMod, bit IsVOP3P,
-                       bit HasModifiers, bit Src0HasMods,
+                       bit HasNeg, bit Src0HasMods,
                        bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
                        bit HasByteSel = 0> {
   string dst = !if(HasDst,
@@ -2294,7 +2295,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
   string bytesel = !if(HasByteSel, "$byte_sel", "");
   string 3PMods = !if(IsVOP3P,
                       !if(HasOpSel, "$op_sel_hi", "")
-                        #!if(HasModifiers, "$neg_lo$neg_hi", ""),
+                        #!if(HasNeg, "$neg_lo$neg_hi", ""),
                       "");
   string clamp = !if(HasClamp, "$clamp", "");
   string omod = !if(HasOMod, "$omod", "");
@@ -2554,6 +2555,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
                                isModifierType<Src1VT>.ret,
                                isModifierType<Src2VT>.ret,
                                HasOMod);
+  field bit HasNeg = HasModifiers;
 
   field bit HasSrc0Mods = HasModifiers;
   field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
@@ -2589,7 +2591,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
                              HasClamp, HasModifiers, HasSrc2Mods,
                              HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
   field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
-                                   NumSrcArgs, HasClamp, HasOpSel,
+                                   NumSrcArgs, HasClamp, HasOpSel, HasNeg,
                                    Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
   field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
                                 NumSrcArgs, HasClamp, HasOMod,
@@ -2607,7 +2609,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
                   Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
                   Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret;
   defvar InsVOP3PDPPBase = getInsVOP3P<Src0VOP3DPP, Src1VOP3DPP,
-                  Src2VOP3DPP, NumSrcArgs, HasClamp, HasOpSel,
+                  Src2VOP3DPP, NumSrcArgs, HasClamp, HasOpSel, HasNeg,
                   Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP>.ret;
 
   field dag InsVOP3Base = !if(IsVOP3P, InsVOP3PDPPBase, InsVOP3DPPBase);
@@ -2635,10 +2637,10 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   // the asm operand name via this HasModifiers flag
   field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
   field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
-   HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
+   HasOpSel, HasOMod, IsVOP3P, HasNeg, HasModifiers, HasModifiers,
    HasModifiers, DstVT, IsFP8ByteSel>.ret;
   field string Asm64 = AsmVOP3Base;
-  field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
+  field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
   field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
                                               HasClamp,
                                               HasOMod,
@@ -2774,6 +2776,7 @@ def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
 def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
 def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
 def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>;
+def VOP_F32_BF16 : VOPProfile <[f32, bf16, untyped, untyped]>;
 
 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -2786,6 +2789,7 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
 def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
@@ -2844,6 +2848,11 @@ def VOP_V16I32_V2I32_V4I32_I32    : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
 def VOP_V4F32_V2I32_V4I32_I32     : VOPProfile <[v4f32,  v2i32, v4i32, i32]>;
 def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 
+def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
+def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
+def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+
+
 class Commutable_REV <string revOp, bit isOrig> {
   string RevOp = revOp;
   bit IsOrig = isOrig;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index d47ff9fe96c94..46b2b4a389200 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1161,7 +1161,8 @@ def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPW32", "OPERAND_REG_IMM_INT32"
 class SrcRegOrImmDeferred9<RegisterClass regClass, string opWidth,
                            string operandType, int immWidth, int OperandSemantics>
     : RegOrImmOperand<regClass, operandType> {
-  let DecoderMethod = "decodeSrcRegOrImmDeferred9<AMDGPUDisassembler::" #
+  string DecoderMethodName = "decodeSrcRegOrImmDeferred9"; // Base decoder name; a def may override it with `let`.
+  let DecoderMethod = DecoderMethodName # "<AMDGPUDisassembler::" # // Template arguments are appended to whichever name is in effect.
                       opWidth # ", " # immWidth # ", " # OperandSemantics # ">";
 }
 
@@ -1222,6 +1223,13 @@ def VSrc_bf16_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW16", "OPERAND_REG_IMM_B
 def VSrc_f16_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW16", "OPERAND_REG_IMM_FP16_DEFERRED", 16, OperandSemantics.FP16>;
 def VSrc_f32_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW32", "OPERAND_REG_IMM_FP32_DEFERRED", 32, OperandSemantics.FP32>;
 
+// True 16 Operands
+def VSrcT_f16_Lo128_Deferred : SrcRegOrImmDeferred9<VS_16_Lo128, "OPW16",
+                                                   "OPERAND_REG_IMM_FP16_DEFERRED", 16, OperandSemantics.FP16> {
+  let DecoderMethodName = "decodeOperand_VSrcT16_Lo128_Deferred"; // Replaces the default decoder name; the base class still appends the template arguments.
+  let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
 def VSrcFake16_bf16_Lo128_Deferred
   : SrcRegOrImmDeferred9<VS_32_Lo128, "OPW16", "OPERAND_REG_IMM_BF16_DEFERRED", 16, OperandSemantics.BF16>;
 def VSrcFake16_f16_Lo128_Deferred
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 01866fbd9da6e..501d00b1f308d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
     return 32768;
   if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
     return 65536;
+  if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+    return 163840;
   return 0;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index c743eb43e3465..3cda173207dfb 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -311,6 +311,9 @@ let OtherPredicates = [UseRealTrue16Insts] in
 let OtherPredicates = [UseFakeTrue16Insts] in
   defm V_CVT_F32_F16_fake16 : VOP1Inst <"v_cvt_f32_f16_fake16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>;
 
+let SubtargetPredicate = HasBF16ConversionInsts in
+defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
+
 let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -761,6 +764,9 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_CVT_U32_U16    : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
 } // End SubtargetPredicate = isGFX11Plus
 
+let SubtargetPredicate = HasPrngInst in
+defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
+
 foreach vt = Reg32Types.types in {
   def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
         (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
@@ -1268,7 +1274,7 @@ multiclass VOP1_Real_vi <bits<10> op> {
 
   if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
   def _sdwa_vi :
-    VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP_SDWA8_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
   if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
@@ -1474,7 +1480,7 @@ def : GCNPat <
 // GFX9
 //===----------------------------------------------------------------------===//
 
-let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+let DecoderNamespace = "GFX9" in {
   multiclass VOP1_Real_gfx9 <bits<10> op> {
     defm NAME : VOP1_Real_e32e64_vi <op>;
 
@@ -1511,11 +1517,15 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
 let AssemblerPredicate = isGFX940Plus in
 defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
 
+defm V_CVT_F32_BF16          : VOP1_Real_gfx9 <0x5b>;
+
 defm V_CVT_F32_FP8       : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
 defm V_CVT_F32_BF8       : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
 defm V_CVT_PK_F32_FP8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
 defm V_CVT_PK_F32_BF8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
 
+defm V_PRNG_B32            : VOP1_Real_gfx9 <0x58>;
+
 class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
   (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
   (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 925b60561c9d6..103575dc351f2 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -374,6 +374,12 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
 }
 
 def VOP_MADAK_F16 : VOP_MADAK <f16>;
+def VOP_MADAK_F16_t16 : VOP_MADAK <f16> { // MADAK f16 profile with IsRealTrue16 set.
+  let IsTrue16 = 1;
+  let IsRealTrue16 = 1;
+  let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
+  let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, VGPRSrc_16_Lo128:$src1, ImmOpType:$imm); // $imm is the last operand.
+}
 def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
   let IsTrue16 = 1;
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
@@ -399,6 +405,12 @@ class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
 }
 
 def VOP_MADMK_F16 : VOP_MADMK <f16>;
+def VOP_MADMK_F16_t16 : VOP_MADMK <f16> { // MADMK f16 profile with IsRealTrue16 set.
+  let IsTrue16 = 1;
+  let IsRealTrue16 = 1;
+  let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
+  let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPRSrc_16_Lo128:$src1); // $imm sits between $src0 and $src1.
+}
 def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
   let IsTrue16 = 1;
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
@@ -467,6 +479,42 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
 }
 
 def VOP_MAC_F16 : VOP_MAC <f16>;
+def VOP_MAC_F16_t16 : VOP_MAC <f16> { // MAC f16 profile with IsRealTrue16 set; overrides the 32-bit, DPP, VOP3 and VOP3DPP operand classes.
+  let IsTrue16 = 1;
+  let IsRealTrue16 = 1;
+  let HasOpSel = 1;
+  let DstRC = VOPDstOperand_t16Lo128;
+  let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2);
+  let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
+  let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
+  let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
+  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+                    Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                    getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2, // stub argument
+                    dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
+                    DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
+  let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                     getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2, // stub argument
+                     dpp8:$dpp8, Dpp8FI:$fi);
+  let DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 1/*IsVOP3Encoding*/>.ret;
+  let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
+  let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
+  let Src0VOP3DPP = VGPRSrc_16; // NOTE(review): hard-coded, while Src1/Src2 go through getVOP3DPPSrcForVT - confirm intended.
+  let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
+  let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
+  let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
+  let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0/*IsFake16*/>.ret;
+  let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0/*IsFake16*/>.ret;
+  let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src1Mod = getSrcMod<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src2Mod = getSrcMod<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+}
 def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
   let IsTrue16 = 1;
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
@@ -766,16 +814,16 @@ defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "
 defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">;
 
 
-let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { 
+let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
   defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">;
   defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">;
 }
 
-let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1, isAdd = 1 in { 
+let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1, isAdd = 1 in {
   defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32">;
 }
 
-let isAdd = 1 in { 
+let isAdd = 1 in {
   defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32">;
   defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
 }
@@ -998,6 +1046,9 @@ let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
 let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
   def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
 }
+let True16Predicate = UseRealTrue16Insts in {
+  def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">;
+}
 let True16Predicate = UseFakeTrue16Insts in {
   def V_FMAMK_F16_fake16 : VOP2_Pseudo <"v_fmamk_f16_fake16", VOP_MADMK_F16_fake16, [], "">;
 }
@@ -1006,6 +1057,9 @@ let isCommutable = 1 in {
 let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
   def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
 }
+let True16Predicate = UseRealTrue16Insts in {
+  def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">;
+}
 let True16Predicate = UseFakeTrue16Insts in {
   def V_FMAAK_F16_fake16 : VOP2_Pseudo <"v_fmaak_f16_fake16", VOP_MADAK_F16_fake16, [], "">;
 }
@@ -1020,6 +1074,9 @@ let SubtargetPredicate = isGFX10Plus in {
 let True16Predicate = NotHasTrue16BitInsts in {
   defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
 }
+let True16Predicate = UseRealTrue16Insts in {
+  defm V_FMAC_F16_t16 : VOP2Inst <"v_fmac_f16_t16", VOP_MAC_F16_t16>;
+}
 let True16Predicate = UseFakeTrue16Insts in {
   defm V_FMAC_F16_fake16 : VOP2Inst <"v_fmac_f16_fake16", VOP_MAC_F16_fake16>;
 }
@@ -1692,8 +1749,8 @@ multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, stri
   VOP3_Realtriple_t16_gfx12<op, asmName, OpName, "", /*IsSingle*/1>;
 
 multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> {
-  defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
-  defm OpName#"_fake16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_fake16">;
+  defm _t16: VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
+  defm _fake16: VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_fake16">;
 }
 
 multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> :
@@ -1712,7 +1769,14 @@ multiclass VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmN
 
 multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName,
                                           string opName = NAME> :
-  VOP2_Real_FULL_with_name_gfx11_gfx12<op, opName, asmName>;
+  VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
+  VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmName,
+                                          string opName = NAME> {
+  defm _t16:    VOP2_Real_FULL_t16_gfx11_gfx12<op, asmName, opName#"_t16">;
+  defm _fake16: VOP2_Real_FULL_t16_gfx11_gfx12<op, asmName, opName#"_fake16">;
+}
 
 multiclass VOP2_Real_FULL_gfx11<bits<6> op> :
   VOP2_Real_FULL<GFX11Gen, op>;
@@ -1747,15 +1811,15 @@ defm V_SUBREV_F16_t16      : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16
 defm V_SUBREV_F16_fake16   : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
 defm V_MUL_F16_t16         : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
 defm V_MUL_F16_fake16      : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
-defm V_FMAC_F16_fake16     : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
+defm V_FMAC_F16            : VOP2_Real_FULL_t16_and_fake16_gfx11_gfx12<0x036, "v_fmac_f16">;
 defm V_LDEXP_F16_t16       : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
 defm V_LDEXP_F16_fake16    : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
 defm V_MAX_F16_t16         : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
 defm V_MAX_F16_fake16      : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
 defm V_MIN_F16_t16         : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
 defm V_MIN_F16_fake16      : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
-defm V_FMAMK_F16_fake16    : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x037, "v_fmamk_f16">;
-defm V_FMAAK_F16_fake16    : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x038, "v_fmaak_f16">;
+defm V_FMAMK_F16           : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037, "v_fmamk_f16">;
+defm V_FMAAK_F16           : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">;
 
 // VOP3 only.
 defm V_CNDMASK_B16         : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
@@ -2290,10 +2354,10 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
 
 } // End AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8"
 
-multiclass VOP2_SDWA_Real <bits<6> op> {
+multiclass VOP2_SDWA8_Real <bits<6> op> {
   if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
   def _sdwa_vi :
-    VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP_SDWA8_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 }
 
@@ -2321,7 +2385,7 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
     }
   if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then
     def _sdwa_vi :
-      VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+      VOP_SDWA8_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
       VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
         VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
         let AsmString = AsmName # ps.AsmOperands;
@@ -2337,7 +2401,7 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
 
 } // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8"
 
-let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+let DecoderNamespace = "GFX9" in {
 
 multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
   def _e32_gfx9 :
@@ -2386,10 +2450,10 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
       VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
-} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
+} // End DecoderNamespace = "GFX9"
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
+  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA8_Real<op>, VOP2_SDWA9_Real<op> {
 
   if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
     def _dpp_vi :
@@ -2401,7 +2465,7 @@ defm V_CNDMASK_B32        : VOP2_Real_e32e64_vi <0x0>;
 defm V_ADD_F32            : VOP2_Real_e32e64_vi <0x1>;
 defm V_SUB_F32            : VOP2_Real_e32e64_vi <0x2>;
 defm V_SUBREV_F32         : VOP2_Real_e32e64_vi <0x3>;
-let AssemblerPredicate = isGCN3ExcludingGFX90A in
+let OtherPredicates = [isGCN3ExcludingGFX90A] in
 defm V_MUL_LEGACY_F32     : VOP2_Real_e32e64_vi <0x4>;
 defm V_MUL_F32            : VOP2_Real_e32e64_vi <0x5>;
 defm V_MUL_I32_I24        : VOP2_Real_e32e64_vi <0x6>;
@@ -2431,6 +2495,7 @@ defm V_ADDC_U32           : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32",    "
 defm V_SUBB_U32           : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32",    "v_subb_u32">;
 defm V_SUBBREV_U32        : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">;
 
+let AssemblerPredicate = isGFX9Only in {
 defm V_ADD_CO_U32         : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_CO_U32",     "v_add_co_u32">;
 defm V_SUB_CO_U32         : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_CO_U32",     "v_sub_co_u32">;
 defm V_SUBREV_CO_U32      : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_CO_U32",  "v_subrev_co_u32">;
@@ -2441,6 +2506,7 @@ defm V_SUBBREV_CO_U32     : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
 defm V_ADD_U32            : VOP2_Real_e32e64_gfx9 <0x34>;
 defm V_SUB_U32            : VOP2_Real_e32e64_gfx9 <0x35>;
 defm V_SUBREV_U32         : VOP2_Real_e32e64_gfx9 <0x36>;
+} // End AssemblerPredicate = isGFX9Only
 
 defm V_BFM_B32            : VOP2_Real_e64only_vi <0x293>;
 defm V_BCNT_U32_B32       : VOP2_Real_e64only_vi <0x28b>;
@@ -2518,7 +2584,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
 
 } // End SubtargetPredicate = HasDLInsts
 
-let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
+let DecoderNamespace = "GFX90A" in {
   multiclass VOP2_Real_e32_gfx90a <bits<6> op> {
     def _e32_gfx90a :
       VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>,
@@ -2551,7 +2617,7 @@ let SubtargetPredicate = HasFmacF64Inst in {
   defm V_FMAC_F64       : VOP2_Real_e32e64_gfx90a <0x4>;
 } // End SubtargetPredicate = HasFmacF64Inst
 
-let SubtargetPredicate = isGFX90APlus, IsSingle = 1 in {
+let IsSingle = 1 in {
   defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34ecdb56e8689..917e1b3974b46 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -226,7 +226,7 @@ let mayRaiseFPException = 0 in {
   defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
 } // End mayRaiseFPException = 0
 
-let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in {
   defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfminimum3>;
   defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
-} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+} // End SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0
@@ -625,7 +625,7 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3
 defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
 defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
 
-let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
   defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
   defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
-} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+} // End SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0
@@ -944,6 +944,30 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
 } // End SubtargetPredicate = isGFX11Plus
 
+// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 patterns
+//        instead of less complex f16. Disable GlobalISel for these for now.
+def bf16_fpround : PatFrag <(ops node:$src0),  (fpround $src0), [{ return true; }]> {
+  let GISelPredicateCode = [{return false;}];
+}
+
+let SubtargetPredicate = HasBF16ConversionInsts in {
+  let ReadsModeReg = 0 in {
+    defm V_CVT_PK_BF16_F32    : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>;
+  }
+  def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
+               (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
+  def : GCNPat<(v2bf16 (bf16_fpround v2f64:$src)),
+               (V_CVT_PK_BF16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub0_sub1)),
+                                      0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub2_sub3)))>;
+  def : GCNPat<(v2bf16 (build_vector (bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                                     (bf16 (bf16_fpround (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+               (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+  def : GCNPat<(bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+               (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>;
+  def : GCNPat<(bf16 (bf16_fpround (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))),
+               (V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
+}
+
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
   defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -1721,5 +1745,6 @@ defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
 
 defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
 defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_PK_BF16_F32: VOP3OpSel_Real_gfx9 <0x268>;
 defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
 defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e246d433401f9..876d4e1acf596 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -20,6 +20,11 @@ class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
   let AsmVOP3Base = AsmVOP3P;
 }
 
+def VOP_MFMA_LD_SCALE : VOP3P_Profile<VOPProfile<[untyped, i32, i32, untyped]>, VOP3P_LD_SCALE> {
+  let HasModifiers = 1;
+  let HasNeg = 0;
+}
+
 // Used for FMA_MIX* and MAD_MIX* insts
 // Their operands are only sort of f16 operands. Depending on
 // op_sel_hi, these may be interpreted as f32. The inline immediate
@@ -626,6 +631,14 @@ def VOPProfileSMFMAC_I32_32X32X32_I8  : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_
 def VOPProfileSMFMAC_F32_16X16X64_F8  : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32,  AVDst_128, AVSrc_64, AVSrc_128>;
 def VOPProfileSMFMAC_F32_32X32X32_F8  : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
 
+def VOPProfileMAI_F32_V8F16_X32     : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,       AISrc_128_f32,  ADst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8F16_X32_VCD : VOPProfileMAI<VOP_V4F32_V8F16_V8F16_V4F32,       VISrc_128_f32,  VDst_128, AVSrc_128>;
+def VOPProfileMAI_F32_V8F16_X16     : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8F16_V8F16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
+
+def VOPProfileMAI_F32_V8BF16_X16     : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     AISrc_512_f32,  ADst_512, AVSrc_128>;
+def VOPProfileMAI_F32_V8BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V8BF16_V8BF16_V16F32,     VISrc_512_f32,  VDst_512, AVSrc_128>;
+
 class MFMATable <bit is_mac, string Name> {
   bit IsMac = is_mac;
   string FMAOp = Name;
@@ -661,8 +674,6 @@ class VgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNotNeedAGPRs> {
   let GISelPredicateCode = MayNotNeedAGPRs_gisel;
 }
 
-let SubtargetPredicate = HasMAIInsts in {
-
 let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
   defm V_ACCVGPR_READ_B32  : VOP3Inst<"v_accvgpr_read_b32",  VOPProfileAccRead>;
   let isMoveImm = 1 in {
@@ -672,6 +683,7 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 
 class MAIInst<string OpName, VOPProfile P, SDPatternOperator node>
   : VOP3InstBase<OpName, P, node> {
+  let SubtargetPredicate = HasMAIInsts;
   Instruction Opcode = !cast<Instruction>(NAME);
   bit is_dgemm = 0;
   bit is_gfx940_xdl = 0;
@@ -687,7 +699,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
                          !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node>)>,
                  MFMATable<0, NAME # "_e64">;
 
-      let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+      let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
       def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
                                 !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node>)>,
                         MFMATable<0, NAME # "_vgprcd_e64">;
@@ -701,7 +713,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
                                  !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node>)>,
                          MFMATable<1, NAME # "_e64">;
 
-        let SubtargetPredicate = isGFX90APlus in
+        let OtherPredicates = [isGFX90APlus] in
         def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
                                       !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node>)>,
                               MFMATable<1, NAME # "_vgprcd_e64">;
@@ -727,7 +739,7 @@ defm V_MFMA_F32_32X32X8F16  : MAIInst<"v_mfma_f32_32x32x8f16",  "F32_V4F16_X16",
 defm V_MFMA_I32_32X32X4I8   : MAIInst<"v_mfma_i32_32x32x4i8",   "I32_I32_X32",   int_amdgcn_mfma_i32_32x32x4i8>;
 }
 
-let Predicates = [isGFX908orGFX90A] in {
+let SubtargetPredicate = isGFX908orGFX90A in {
 defm V_MFMA_I32_16X16X16I8  : MAIInst<"v_mfma_i32_16x16x16i8",  "I32_I32_X4",    int_amdgcn_mfma_i32_16x16x16i8>;
 defm V_MFMA_I32_32X32X8I8   : MAIInst<"v_mfma_i32_32x32x8i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_32x32x8i8>;
 defm V_MFMA_F32_4X4X2BF16   : MAIInst<"v_mfma_f32_4x4x2bf16",   "F32_V2I16_X4",  int_amdgcn_mfma_f32_4x4x2bf16>;
@@ -737,9 +749,17 @@ defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32",
 defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
 }
 
-} // End SubtargetPredicate = HasMAIInsts
+let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
+defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
+defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
+}
 
-let Predicates = [isGFX90APlus] in {
+let SubtargetPredicate = HasGFX950Insts in {
+defm V_MFMA_LD_SCALE_B32 : VOP3PInst<"v_mfma_ld_scale_b32", VOP_MFMA_LD_SCALE>;
+}
+
+let SubtargetPredicate = isGFX90APlus in {
   let is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32",  int_amdgcn_mfma_f32_32x32x4bf16_1k>;
   defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_16x16x4bf16_1k>;
@@ -752,7 +772,7 @@ let Predicates = [isGFX90APlus] in {
   defm V_MFMA_F64_16X16X4F64      : MAIInst<"v_mfma_f64_16x16x4f64",      "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
   defm V_MFMA_F64_4X4X4F64        : MAIInst<"v_mfma_f64_4x4x4f64",        "F64_4X4X4F64",   int_amdgcn_mfma_f64_4x4x4f64>;
   }
-} // End Predicates = [isGFX90APlus]
+} // End SubtargetPredicate = isGFX90APlus
 
 let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
   defm V_MFMA_I32_32X32X16I8       : MAIInst<"v_mfma_i32_32x32x16i8",       "I32_I64_X32",    int_amdgcn_mfma_i32_32x32x16_i8>;
@@ -1618,14 +1638,17 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
                                           VOPProfile Pfl_ACD = PS_ACD.Pfl,
                                           VOPProfile Pfl_VCD = PS_VCD.Pfl> {
   if !ne(NameFrom, NameTo) then {
-    def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
+    let SubtargetPredicate = PS_ACD.SubtargetPredicate,
+        OtherPredicates = PS_ACD.OtherPredicates in {
+      def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
                      (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
                          Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
                          CBSZ:$cbsz, ABID:$abid, blgp:$blgp)>, PredicateControl;
-    def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
+      def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
                      (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst,
                          Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2,
                          CBSZ:$cbsz, ABID:$abid, blgp:$blgp)>, PredicateControl;
+    }
   }
 }
 
@@ -1642,7 +1665,10 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
                     VOP3Pe_MAI <op, PS_VCD.Pfl, 0>;
   } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940"
 
-  let SubtargetPredicate = isGFX940Plus in {
+  let SubtargetPredicate = PS_ACD.SubtargetPredicate,
+      OtherPredicates = PS_ACD.OtherPredicates,
+      AssemblerPredicate = isGFX940Plus
+      in {
     defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;
 
     if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then
@@ -1650,6 +1676,16 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
   }
 }
 
+multiclass VOP3P_Real_MFMA_gfx950<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
+                                  VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
+                                  VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
+    let SubtargetPredicate = HasGFX950Insts,
+        AssemblerPredicate = HasGFX950Insts in {
+    defm "" : VOP3P_Real_MFMA_gfx940<op, Name, PS_ACD, PS_VCD>;
+  }
+}
+
+
 multiclass VOP3P_Real_MFMA_vi<bits<7> op> {
   def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
             VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
@@ -1679,7 +1715,6 @@ multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
   }
 }
 
-let SubtargetPredicate = isGFX8GFX9 in {
 defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
 defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;
 defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>;
@@ -1701,11 +1736,9 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>;
 defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>;
 defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>;
 
-let OtherPredicates = [HasMadMixInsts] in {
 defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>;
 defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>;
 defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>;
-}
 
 let OtherPredicates = [HasFmaMixInsts],
     DecoderNamespace = "GFX9_DL" in {
@@ -1726,9 +1759,6 @@ defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x2b>;
 
 defm V_DOT4_I32_I8  : VOP3P_Real_vi <0x28>;
 defm V_DOT8_I32_I4  : VOP3P_Real_vi <0x2a>;
-} // End SubtargetPredicate = isGFX8GFX9
-
-let OtherPredicates = [HasMAIInsts] in {
 
 defm V_ACCVGPR_READ_B32  : VOP3P_Real_MAI <0x58>;
 defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>;
@@ -1754,8 +1784,6 @@ defm V_MFMA_F32_4X4X2BF16   : VOP3P_Real_MFMA_vi_gfx90a <0x6b>;
 defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6c>;
 defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6d>;
 
-} // End OtherPredicates = [HasMAIInsts]
-
 defm V_MFMA_F32_32X32X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x63>;
 defm V_MFMA_F32_16X16X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x64>;
 defm V_MFMA_F32_4X4X4BF16_1K    : VOP3P_Real_MFMA_gfx90a <0x65>;
@@ -1764,13 +1792,17 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
 defm V_MFMA_F64_16X16X4F64      : VOP3P_Real_MFMA_gfx90a <0x6e>;
 defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
+defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
+defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+
+defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>;
+
 defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
-let SubtargetPredicate = HasXF32Insts in {
 defm V_MFMA_F32_16X16X8XF32      : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
 defm V_MFMA_F32_32X32X4XF32      : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
-} // End SubtargetPredicate = HasXF32Insts
-let SubtargetPredicate = HasFP8Insts in {
+
 defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
 defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
 defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
@@ -1779,7 +1811,6 @@ defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
 defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
 defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
 defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;
-} // End SubtargetPredicate = HasFP8Insts
 
 defm V_MFMA_F32_32X32X4BF16_1K   : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
 defm V_MFMA_F32_16X16X4BF16_1K   : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
@@ -1796,7 +1827,6 @@ defm V_SMFMAC_F32_16X16X32_BF16    : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
 defm V_SMFMAC_F32_32X32X16_BF16    : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
 defm V_SMFMAC_I32_16X16X64_I8      : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
 defm V_SMFMAC_I32_32X32X32_I8      : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
-let SubtargetPredicate = HasFP8Insts in {
 defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
 defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
 defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
@@ -1805,7 +1835,6 @@ defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x3
 defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
 defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
 defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
-} // End SubtargetPredicate = HasFP8Insts
 
 defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
 defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index d6e08dce130ce..f4ccae1decb1d 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -2290,7 +2290,7 @@ multiclass VOPC_Real_vi <bits<10> op> {
 
   if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
   def _sdwa_vi :
-    VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP_SDWA8_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
   if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index aab5dc7465d93..a6e6adac04e5a 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -423,7 +423,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
   bits<2> index_key_8bit;
   bits<1> index_key_16bit;
 
-  let Inst{7-0} = vdst;
+  let Inst{7-0} = !if(P.HasDst, vdst, 0);
   let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
   let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
   let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
@@ -650,7 +650,6 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 
   let SubtargetPredicate = HasSDWA;
-  let AssemblerPredicate = HasSDWA;
   let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
                                          AMDGPUAsmVariants.Disable);
   let DecoderNamespace = "GFX8";
@@ -658,7 +657,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   VOPProfile Pfl = P;
 }
 
-class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
+class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
 
@@ -676,7 +675,7 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
 
   // Copy relevant pseudo op flags
   let SubtargetPredicate   = ps.SubtargetPredicate;
-  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AssemblerPredicate   = HasSDWA8;
   let AsmMatchConverter    = ps.AsmMatchConverter;
   let AsmVariantName       = ps.AsmVariantName;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
@@ -708,7 +707,7 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let Constraints     = ps.Constraints;
   let DisableEncoding = ps.DisableEncoding;
 
-  let SubtargetPredicate = HasSDWA9;
+  let SubtargetPredicate = ps.SubtargetPredicate;
   let AssemblerPredicate = HasSDWA9;
   let OtherPredicates    = ps.OtherPredicates;
   let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
@@ -735,7 +734,7 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
 
 class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
-  let SubtargetPredicate = HasSDWA10;
+  let SubtargetPredicate = ps.SubtargetPredicate;
   let AssemblerPredicate = HasSDWA10;
   let DecoderNamespace = "GFX10";
 }
@@ -1366,6 +1365,10 @@ def VOP3_OPSEL   : VOP3Features<1, 1, 0, 0>;
 def VOP3_PACKED  : VOP3Features<1, 1, 1, 0>;
 def VOP3_MAI     : VOP3Features<0, 0, 0, 1>;
 
+// Packed is misleading, but it enables the appropriate op_sel
+// modifiers.
+def VOP3P_LD_SCALE : VOP3Features<0, 1, 1, 0>;
+
 class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
 
   let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
@@ -1508,7 +1511,7 @@ class VOP3_DPP16_t16_Helper<bits<10> op, VOP_DPP_Pseudo ps,
   let SchedRW = ps.SchedRW;
   let Uses = ps.Uses;
   let AssemblerPredicate = HasDPP16;
-  let SubtargetPredicate = HasDPP16;
+  let SubtargetPredicate = ps.SubtargetPredicate;
   let OtherPredicates = ps.OtherPredicates;
 }
 
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 8e79a0a344067..3fda15a429017 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2590,14 +2590,14 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       return true;
     }
 
-    case ARM::MOVsrl_glue:
-    case ARM::MOVsra_glue: {
+    case ARM::LSRs1:
+    case ARM::ASRs1: {
       // These are just fancy MOVs instructions.
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
               MI.getOperand(0).getReg())
           .add(MI.getOperand(1))
           .addImm(ARM_AM::getSORegOpc(
-              (Opcode == ARM::MOVsrl_glue ? ARM_AM::lsr : ARM_AM::asr), 1))
+              (Opcode == ARM::LSRs1 ? ARM_AM::lsr : ARM_AM::asr), 1))
           .add(predOps(ARMCC::AL))
           .addReg(ARM::CPSR, RegState::Define);
       MI.eraseFromParent();
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 2f24006c198a2..f495aa701e875 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -177,6 +177,7 @@ enum class SpillArea {
   GPRCS2,
   DPRCS1,
   DPRCS2,
+  GPRCS3,
   FPCXT,
 };
 
@@ -197,7 +198,7 @@ SpillArea getSpillArea(Register Reg,
   // SplitR11WindowsSEH:
   // push {r0-r10, r12}  GPRCS1
-  // vpush {r8-d15}      DPRCS1
+  // vpush {d8-d15}      DPRCS1
-  // push {r11, lr}      GPRCS2
+  // push {r11, lr}      GPRCS3
   //
   // SplitR11AAPCSSignRA:
-  // push {r0-r10, r12}  GPRSC1
+  // push {r0-r10, r12}  GPRCS1
@@ -238,10 +239,13 @@ SpillArea getSpillArea(Register Reg,
       return SpillArea::GPRCS1;
 
   case ARM::R11:
-    if (Variation == ARMSubtarget::NoSplit)
-      return SpillArea::GPRCS1;
-    else
+    if (Variation == ARMSubtarget::SplitR7 ||
+        Variation == ARMSubtarget::SplitR11AAPCSSignRA)
       return SpillArea::GPRCS2;
+    if (Variation == ARMSubtarget::SplitR11WindowsSEH)
+      return SpillArea::GPRCS3;
+
+    return SpillArea::GPRCS1;
 
   case ARM::R12:
     if (Variation == ARMSubtarget::SplitR7)
@@ -250,11 +254,12 @@ SpillArea getSpillArea(Register Reg,
       return SpillArea::GPRCS1;
 
   case ARM::LR:
-    if (Variation == ARMSubtarget::SplitR11WindowsSEH ||
-        Variation == ARMSubtarget::SplitR11AAPCSSignRA)
+    if (Variation == ARMSubtarget::SplitR11AAPCSSignRA)
       return SpillArea::GPRCS2;
-    else
-      return SpillArea::GPRCS1;
+    if (Variation == ARMSubtarget::SplitR11WindowsSEH)
+      return SpillArea::GPRCS3;
+
+    return SpillArea::GPRCS1;
 
   case ARM::D0:
   case ARM::D1:
@@ -912,7 +917,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
 
   // Determine the sizes of each callee-save spill areas and record which frame
   // belongs to which callee-save spill areas.
-  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCS1Size = 0, GPRCS3Size = 0,
+           DPRCS2Size = 0;
   int FramePtrSpillFI = 0;
   int D8SpillFI = 0;
 
@@ -970,14 +976,19 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
       GPRCS2Size += 4;
       break;
     case SpillArea::DPRCS1:
-      DPRCSSize += 8;
+      DPRCS1Size += 8;
+      break;
+    case SpillArea::GPRCS3:
+      GPRCS3Size += 4;
       break;
     case SpillArea::DPRCS2:
+      DPRCS2Size += 8; // Each D register occupies 8 bytes, as in DPRCS1.
       break;
     }
   }
 
-  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push,
+                              DPRCS1Push, GPRCS3Push;
 
   // Move past the PAC computation.
   if (AFI->shouldSignReturnAddress())
@@ -1012,20 +1023,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
   unsigned FPCXTOffset = NumBytes - ArgRegsSaveSize - FPCXTSaveSize;
   unsigned GPRCS1Offset = FPCXTOffset - GPRCS1Size;
   unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
-  Align DPRAlign = DPRCSSize ? std::min(Align(8), Alignment) : Align(4);
-  unsigned DPRGapSize = GPRCS1Size + FPCXTSaveSize + ArgRegsSaveSize;
-  if (PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) {
-    DPRGapSize += GPRCS2Size;
-  }
-  DPRGapSize %= DPRAlign.value();
 
-  unsigned DPRCSOffset;
-  if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
-    DPRCSOffset = GPRCS1Offset - DPRGapSize - DPRCSSize;
-    GPRCS2Offset = DPRCSOffset - GPRCS2Size;
-  } else {
-    DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
-  }
+  Align DPRAlign = DPRCS1Size ? std::min(Align(8), Alignment) : Align(4);
+  unsigned DPRGapSize =
+      (ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size + GPRCS2Size) %
+      DPRAlign.value();
+
+  unsigned DPRCS1Offset = GPRCS2Offset - DPRGapSize - DPRCS1Size;
+
   if (HasFP) {
     // Offset from the CFA to the saved frame pointer, will be negative.
     [[maybe_unused]] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
@@ -1038,11 +1043,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
   }
   AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
   AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
-  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  AFI->setDPRCalleeSavedArea1Offset(DPRCS1Offset);
 
-  // Move GPRCS2, unless using SplitR11WindowsSEH, in which case it will be
-  // after DPRCS1.
-  if (GPRCS2Size > 0 && PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) {
+  // Move past area 2.
+  if (GPRCS2Size > 0) {
+    assert(PushPopSplit != ARMSubtarget::SplitR11WindowsSEH);
     GPRCS2Push = LastPush = MBBI++;
     DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush);
     if (FramePtrSpillArea == SpillArea::GPRCS2)
@@ -1063,19 +1068,19 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
     }
   }
 
-  // Move past DPRCS1.
-  if (DPRCSSize > 0) {
+  // Move past DPRCS1.
+  if (DPRCS1Size > 0) {
     // Since vpush register list cannot have gaps, there may be multiple vpush
     // instructions in the prologue.
     while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
       DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI),
                                      BeforeFPPush);
-      LastPush = MBBI++;
+      DPRCS1Push = LastPush = MBBI++;
     }
   }
 
   // Move past the aligned DPRCS2 area.
-  if (AFI->getNumAlignedDPRCS2Regs() > 0) {
+  if (DPRCS2Size > 0) {
     MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
     // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
     // leaves the stack pointer pointing to the DPRCS2 area.
@@ -1083,13 +1088,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
     // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
     NumBytes += MFI.getObjectOffset(D8SpillFI);
   } else
-    NumBytes = DPRCSOffset;
-
-  // Move GPRCS2, if using using SplitR11WindowsSEH.
-  if (GPRCS2Size > 0 && PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
-    GPRCS2Push = LastPush = MBBI++;
-    DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush);
-    if (FramePtrSpillArea == SpillArea::GPRCS2)
+    NumBytes = DPRCS1Offset;
+
+  // Move past GPRCS3, if using SplitR11WindowsSEH.
+  if (GPRCS3Size > 0) {
+    assert(PushPopSplit == ARMSubtarget::SplitR11WindowsSEH);
+    GPRCS3Push = LastPush = MBBI++;
+    DefCFAOffsetCandidates.addInst(LastPush, GPRCS3Size, BeforeFPPush);
+    if (FramePtrSpillArea == SpillArea::GPRCS3)
       BeforeFPPush = false;
   }
 
@@ -1211,11 +1217,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
       FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) +
                           ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size +
                           sizeOfSPAdjustment(*FPPushInst);
-      if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH)
-        FPOffsetAfterPush += DPRCSSize + DPRGapSize;
       LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS2, offset "
                         << FPOffsetAfterPush << "  after that push\n");
       break;
+    case SpillArea::GPRCS3:
+      FPPushInst = GPRCS3Push;
+      FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) +
+                          ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size +
+                          GPRCS2Size + DPRCS1Size + DPRGapSize +
+                          sizeOfSPAdjustment(*FPPushInst);
+      LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS3, offset "
+                        << FPOffsetAfterPush << "  after that push\n");
+      break;
     default:
       llvm_unreachable("frame pointer in unknown spill area");
       break;
@@ -1279,7 +1292,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
         CFIPos = std::next(GPRCS2Push);
         break;
       case SpillArea::DPRCS1:
-        CFIPos = std::next(LastPush);
+        CFIPos = std::next(DPRCS1Push);
+        break;
+      case SpillArea::GPRCS3:
+        CFIPos = std::next(GPRCS3Push);
         break;
       case SpillArea::FPCXT:
       case SpillArea::DPRCS2:
@@ -1317,7 +1333,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
   AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
   AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
   AFI->setDPRCalleeSavedGapSize(DPRGapSize);
-  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+  AFI->setDPRCalleeSavedArea1Size(DPRCS1Size);
+  AFI->setGPRCalleeSavedArea3Size(GPRCS3Size);
 
   // If we need dynamic stack realignment, do it here. Be paranoid and make
   // sure if we also have VLAs, we have a base pointer for frame access.
@@ -1438,12 +1455,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
     }
 
     // Move SP to start of FP callee save spill area.
-    NumBytes -= (ReservedArgStack +
-                 AFI->getFPCXTSaveAreaSize() +
-                 AFI->getGPRCalleeSavedArea1Size() +
-                 AFI->getGPRCalleeSavedArea2Size() +
-                 AFI->getDPRCalleeSavedGapSize() +
-                 AFI->getDPRCalleeSavedAreaSize());
+    NumBytes -=
+        (ReservedArgStack + AFI->getFPCXTSaveAreaSize() +
+         AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() +
+         AFI->getDPRCalleeSavedGapSize() + AFI->getDPRCalleeSavedArea1Size() +
+         AFI->getGPRCalleeSavedArea3Size());
 
     // Reset SP based on frame pointer only if the stack frame extends beyond
     // frame pointer stack slot or target is ELF and the function has FP.
@@ -1491,11 +1507,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                    MachineInstr::FrameDestroy);
 
     // Increment past our save areas.
-    if (AFI->getGPRCalleeSavedArea2Size() &&
-        PushPopSplit == ARMSubtarget::SplitR11WindowsSEH)
+    if (AFI->getGPRCalleeSavedArea3Size()) {
+      assert(PushPopSplit == ARMSubtarget::SplitR11WindowsSEH);
+      (void)PushPopSplit;
       MBBI++;
+    }
 
-    if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
+    if (MBBI != MBB.end() && AFI->getDPRCalleeSavedArea1Size()) {
       MBBI++;
       // Since vpop register list cannot have gaps, there may be multiple vpop
       // instructions in the epilogue.
@@ -1509,9 +1527,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                    MachineInstr::FrameDestroy);
     }
 
-    if (AFI->getGPRCalleeSavedArea2Size() &&
-        PushPopSplit != ARMSubtarget::SplitR11WindowsSEH)
+    if (AFI->getGPRCalleeSavedArea2Size()) {
+      assert(PushPopSplit != ARMSubtarget::SplitR11WindowsSEH);
+      (void)PushPopSplit;
       MBBI++;
+    }
     if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
 
     if (ReservedArgStack || IncomingArgStackToRestore) {
@@ -2128,19 +2148,14 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(
   auto IsDPRCS1 = [&CheckRegArea](unsigned Reg) {
     return CheckRegArea(Reg, SpillArea::DPRCS1);
   };
+  auto IsGPRCS3 = [&CheckRegArea](unsigned Reg) {
+    return CheckRegArea(Reg, SpillArea::GPRCS3);
+  };
 
-  // Windows SEH requires the floating-point registers to be pushed between the
-  // two blocks of GPRs in some situations. In all other cases, they are pushed
-  // below the GPRs.
-  if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
-    emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS1);
-    emitPushInst(MBB, MI, CSI, FltOpc, 0, true, IsDPRCS1);
-    emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS2);
-  } else {
-    emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS1);
-    emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS2);
-    emitPushInst(MBB, MI, CSI, FltOpc, 0, true, IsDPRCS1);
-  }
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS1);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS2);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, IsDPRCS1);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, IsGPRCS3);
 
   // The code above does not insert spill code for the aligned DPRCS2 registers.
   // The stack realignment code will be inserted between the push instructions
@@ -2190,16 +2205,14 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(
   auto IsDPRCS1 = [&CheckRegArea](unsigned Reg) {
     return CheckRegArea(Reg, SpillArea::DPRCS1);
   };
+  auto IsGPRCS3 = [&CheckRegArea](unsigned Reg) {
+    return CheckRegArea(Reg, SpillArea::GPRCS3);
+  };
 
-  if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
-    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS2);
-    emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, IsDPRCS1);
-    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS1);
-  } else {
-    emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, IsDPRCS1);
-    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS2);
-    emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS1);
-  }
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS3);
+  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, IsDPRCS1);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS2);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, IsGPRCS1);
 
   return true;
 }
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 5c45e081e1b16..73ee8cf81adcd 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -710,8 +710,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
         Base = CurDAG->getTargetFrameIndex(
             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
       }
-      OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32,
-                                         /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32);
       return true;
     }
   }
@@ -880,8 +879,7 @@ bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
   if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
     if (AddSub == ARM_AM::sub) Val *= -1;
     Offset = CurDAG->getRegister(0, MVT::i32);
-    Opc =
-        CurDAG->getSignedConstant(Val, SDLoc(Op), MVT::i32, /*isTarget*/ true);
+    Opc = CurDAG->getSignedTargetConstant(Val, SDLoc(Op), MVT::i32);
     return true;
   }
 
@@ -1185,8 +1183,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
   int RHSC;
   if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
     Base = N.getOperand(0);
-    OffImm =
-        CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32, /*isTarget=*/true);
+    OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32);
     return true;
   }
 
@@ -1248,8 +1245,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
         if (MFI.getObjectAlign(FI) >= Align(4)) {
           Base = CurDAG->getTargetFrameIndex(
               FI, TLI->getPointerTy(CurDAG->getDataLayout()));
-          OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32,
-                                             /*isTarget=*/true);
+          OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32);
           return true;
         }
       }
@@ -1269,8 +1265,8 @@ bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base,
       Base = N.getOperand(0);
       if (N.getOpcode() == ISD::SUB)
         RHSC = -RHSC;
-      OffImm = CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N),
-                                         MVT::i32, /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N),
+                                               MVT::i32);
       return true;
     }
   }
@@ -1332,8 +1328,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
         Base = CurDAG->getTargetFrameIndex(
             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
       }
-      OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32,
-                                         /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32);
       return true;
     }
   }
@@ -1359,9 +1354,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base,
 
       if (N.getOpcode() == ISD::SUB)
         RHSC = -RHSC;
-      OffImm =
-          CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32,
-                                    /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N),
+                                               MVT::i32);
       return true;
     }
   }
@@ -1391,8 +1385,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
         Base = CurDAG->getTargetFrameIndex(
             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
       }
-      OffImm = CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32,
-                                         /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32);
       return true;
     }
   }
@@ -1409,10 +1402,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
   int RHSC;
   if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
     OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
-                 ? CurDAG->getSignedConstant(RHSC, SDLoc(N), MVT::i32,
-                                             /*isTarget=*/true)
-                 : CurDAG->getSignedConstant(-RHSC, SDLoc(N), MVT::i32,
-                                             /*isTarget=*/true);
+                 ? CurDAG->getSignedTargetConstant(RHSC, SDLoc(N), MVT::i32)
+                 : CurDAG->getSignedTargetConstant(-RHSC, SDLoc(N), MVT::i32);
     return true;
   }
 
@@ -1435,8 +1426,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base,
 
       if (N.getOpcode() == ISD::SUB)
         RHSC = -RHSC;
-      OffImm = CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N),
-                                         MVT::i32, /*isTarget=*/true);
+      OffImm = CurDAG->getSignedTargetConstant(RHSC * (1 << Shift), SDLoc(N),
+                                               MVT::i32);
       return true;
     }
   }
@@ -1479,10 +1470,10 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
   // 7 bit constant, shifted by Shift.
   if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) {
     OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
-                 ? CurDAG->getSignedConstant(RHSC * (1 << Shift), SDLoc(N),
-                                             MVT::i32, /*isTarget=*/true)
-                 : CurDAG->getSignedConstant(-RHSC * (1 << Shift), SDLoc(N),
-                                             MVT::i32, /*isTarget=*/true);
+                 ? CurDAG->getSignedTargetConstant(RHSC * (1 << Shift),
+                                                   SDLoc(N), MVT::i32)
+                 : CurDAG->getSignedTargetConstant(-RHSC * (1 << Shift),
+                                                   SDLoc(N), MVT::i32);
     return true;
   }
   return false;
@@ -1492,8 +1483,7 @@ template <int Min, int Max>
 bool ARMDAGToDAGISel::SelectImmediateInRange(SDValue N, SDValue &OffImm) {
   int Val;
   if (isScaledConstantInRange(N, 1, Min, Max, Val)) {
-    OffImm =
-        CurDAG->getSignedConstant(Val, SDLoc(N), MVT::i32, /*isTarget=*/true);
+    OffImm = CurDAG->getSignedTargetConstant(Val, SDLoc(N), MVT::i32);
     return true;
   }
   return false;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 7fce91f97f361..6b290135c5bcb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -149,6 +149,9 @@ MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
   cl::desc("Maximum interleave factor for MVE VLDn to generate."),
   cl::init(2));
 
+/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
+constexpr MVT FlagsVT = MVT::i32;
+
 // The APCS parameter registers.
 static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -1730,14 +1733,14 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(ARMISD::ASRL)
     MAKE_CASE(ARMISD::LSRL)
     MAKE_CASE(ARMISD::LSLL)
-    MAKE_CASE(ARMISD::SRL_GLUE)
-    MAKE_CASE(ARMISD::SRA_GLUE)
+    MAKE_CASE(ARMISD::LSLS)
+    MAKE_CASE(ARMISD::LSRS1)
+    MAKE_CASE(ARMISD::ASRS1)
     MAKE_CASE(ARMISD::RRX)
     MAKE_CASE(ARMISD::ADDC)
     MAKE_CASE(ARMISD::ADDE)
     MAKE_CASE(ARMISD::SUBC)
     MAKE_CASE(ARMISD::SUBE)
-    MAKE_CASE(ARMISD::LSLS)
     MAKE_CASE(ARMISD::VMOVRRD)
     MAKE_CASE(ARMISD::VMOVDRR)
     MAKE_CASE(ARMISD::VMOVhr)
@@ -2970,8 +2973,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Ops.push_back(Callee);
 
   if (isTailCall) {
-    Ops.push_back(
-        DAG.getSignedConstant(SPDiff, dl, MVT::i32, /*isTarget=*/true));
+    Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
   }
 
   // Add argument registers to the end of the list so that they are known live
@@ -4969,14 +4971,14 @@ SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                      SelectionDAG &DAG, const SDLoc &dl,
                                      bool Signaling) const {
   assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
-  SDValue Cmp;
+  SDValue Flags;
   if (!isFloatingPointZero(RHS))
-    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
-                      dl, MVT::Glue, LHS, RHS);
+    Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
+                        LHS, RHS);
   else
-    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
-                      dl, MVT::Glue, LHS);
-  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
+    Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
+                        FlagsVT, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Flags);
 }
 
 /// duplicateCmp - Glue values can have only one use, so this function
@@ -4989,15 +4991,11 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
 
   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
-  Cmp = Cmp.getOperand(0);
-  Opc = Cmp.getOpcode();
-  if (Opc == ARMISD::CMPFP)
-    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
-  else {
-    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
-    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
-  }
-  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
+  SDValue Flags = Cmp.getOperand(0);
+  assert((Flags.getOpcode() == ARMISD::CMPFP ||
+          Flags.getOpcode() == ARMISD::CMPFPw0) &&
+         "unexpected operand of FMSTAT");
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Flags);
 }
 
 // This function returns three things: the arithmetic computation itself
@@ -6847,10 +6845,10 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
 
-  // First, build a SRA_GLUE/SRL_GLUE op, which shifts the top part by one and
-  // captures the result into a carry flag.
-  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_GLUE:ARMISD::SRA_GLUE;
-  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
+  // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
+  // captures the shifted out bit into a carry flag.
+  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
 
   // The low part is an ARMISD::RRX operand, which shifts the carry in.
   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
@@ -20615,8 +20613,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
         }
         return;
     }
-    Result = DAG.getSignedConstant(CVal, SDLoc(Op), Op.getValueType(),
-                                   /*isTarget=*/true);
+    Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
     break;
   }
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 0e086f3340ccb..344a0ad91e517 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -101,15 +101,15 @@ class VectorType;
 
     BCC_i64,
 
-    SRL_GLUE, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
-    SRA_GLUE, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
-    RRX,      // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+    LSLS,  // Flag-setting shift left.
+    LSRS1, // Flag-setting logical shift right by one bit.
+    ASRS1, // Flag-setting arithmetic shift right by one bit.
+    RRX,   // Shift right one bit with carry in.
 
     ADDC, // Add with carry
     ADDE, // Add using carry
     SUBC, // Sub with carry
     SUBE, // Sub using carry
-    LSLS, // Shift left producing carry
 
     VMOVRRD, // double to two gprs.
     VMOVDRR, // Two gprs to double.
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index d24d4af36f0d8..db38b43279b86 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -14,6 +14,9 @@
 // ARM specific DAG Nodes.
 //
 
+/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
+defvar FlagsVT = i32;
+
 // Type profiles.
 def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32> ]>;
@@ -77,6 +80,18 @@ def SDT_ARMMEMCPY  : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                           SDTCisVT<2, i32>, SDTCisVT<3, i32>,
                                           SDTCisVT<4, i32>]>;
 
+def SDTIntUnaryOpWithFlagsOut : SDTypeProfile<2, 1, [
+  SDTCisInt<0>,         // result
+  SDTCisVT<1, FlagsVT>, // out flags
+  SDTCisSameAs<2, 0>    // operand
+]>;
+
+def SDTIntUnaryOpWithFlagsIn : SDTypeProfile<1, 2, [
+  SDTCisInt<0>,        // result (index 0)
+  SDTCisSameAs<1, 0>,  // operand (index 1) has the same type as the result
+  SDTCisVT<2, FlagsVT> // in flags (index 2, the second operand)
+]>;
+
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
                                              SDTCisSameAs<0, 3>,
@@ -191,9 +206,9 @@ def ARMasrl          : SDNode<"ARMISD::ASRL", SDT_ARMIntShiftParts, []>;
 def ARMlsrl          : SDNode<"ARMISD::LSRL", SDT_ARMIntShiftParts, []>;
 def ARMlsll          : SDNode<"ARMISD::LSLL", SDT_ARMIntShiftParts, []>;
 
-def ARMsrl_glue      : SDNode<"ARMISD::SRL_GLUE", SDTIntUnaryOp, [SDNPOutGlue]>;
-def ARMsra_glue      : SDNode<"ARMISD::SRA_GLUE", SDTIntUnaryOp, [SDNPOutGlue]>;
-def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
+def ARMlsrs1 : SDNode<"ARMISD::LSRS1", SDTIntUnaryOpWithFlagsOut>;
+def ARMasrs1 : SDNode<"ARMISD::ASRS1", SDTIntUnaryOpWithFlagsOut>;
+def ARMrrx   : SDNode<"ARMISD::RRX"  , SDTIntUnaryOpWithFlagsIn>;
 
 def ARMaddc          : SDNode<"ARMISD::ADDC",  SDTBinaryArithWithFlags,
                               [SDNPCommutative]>;
@@ -371,14 +386,14 @@ def ARMVCCElse : PatLeaf<(i32 2)>;
 
 // imm_neg_XFORM - Return the negation of an i32 immediate value.
 def imm_neg_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32,
-                                   /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(-(int)N->getZExtValue(), SDLoc(N),
+                                         MVT::i32);
 }]>;
 
 // imm_not_XFORM - Return the complement of a i32 immediate value.
 def imm_not_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32,
-                                   /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(~(int)N->getZExtValue(), SDLoc(N),
+                                         MVT::i32);
 }]>;
 def gi_imm_not_XFORM : GICustomOperandRenderer<"renderInvertedImm">,
   GISDNodeXFormEquiv<imm_not_XFORM>;
@@ -3730,20 +3745,17 @@ def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
       Requires<[IsARM, HasV6T2]>;
 
 let Uses = [CPSR] in
-def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
-                    [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
-                    Requires<[IsARM]>, Sched<[WriteALU]>;
-
-// These aren't really mov instructions, but we have to define them this way
-// due to glue operands.
+def RRX : PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                     [(set GPR:$Rd, (ARMrrx GPR:$Rm, CPSR))]>,
+          UnaryDP, Requires<[IsARM]>, Sched<[WriteALU]>;
 
 let Defs = [CPSR] in {
-def MOVsrl_glue : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
-                      [(set GPR:$dst, (ARMsrl_glue GPR:$src))]>, UnaryDP,
-                      Sched<[WriteALU]>, Requires<[IsARM]>;
-def MOVsra_glue : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
-                      [(set GPR:$dst, (ARMsra_glue GPR:$src))]>, UnaryDP,
-                      Sched<[WriteALU]>, Requires<[IsARM]>;
+  def LSRs1 : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         [(set GPR:$dst, CPSR, (ARMlsrs1 GPR:$src))]>,
+              UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>;
+  def ASRs1 : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         [(set GPR:$dst, CPSR, (ARMasrs1 GPR:$src))]>,
+              UnaryDP, Sched<[WriteALU]>, Requires<[IsARM]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index cb20aacb539ad..aa5c0a5889768 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -77,8 +77,8 @@ def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
 
 // t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
 def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(-((int)N->getZExtValue()), SDLoc(N),
-                                   MVT::i32, /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(-((int)N->getZExtValue()), SDLoc(N),
+                                         MVT::i32);
 }]>;
 
 // so_imm_notSext_XFORM - Return a so_imm value packed into the format
@@ -2787,8 +2787,9 @@ def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
 
 let Uses = [CPSR] in {
 def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
-                   "rrx", "\t$Rd, $Rm",
-                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
+                      "rrx", "\t$Rd, $Rm",
+                      [(set rGPR:$Rd, (ARMrrx rGPR:$Rm, CPSR))]>,
+            Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2800,12 +2801,13 @@ def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
 }
 }
 
+// These differ from t2LSRri / t2ASRri in that they are flag-setting
+// and have a hardcoded shift amount = 1.
 let isCodeGenOnly = 1, Defs = [CPSR] in {
-def t2MOVsrl_glue : T2TwoRegShiftImm<
-                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
-                        "lsrs", ".w\t$Rd, $Rm, #1",
-                        [(set rGPR:$Rd, (ARMsrl_glue rGPR:$Rm))]>,
-                        Sched<[WriteALU]> {
+def t2LSRs1 : T2TwoRegShiftImm<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                               "lsrs", ".w\t$Rd, $Rm, #1",
+                               [(set rGPR:$Rd, CPSR, (ARMlsrs1 rGPR:$Rm))]>,
+              Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2816,11 +2818,10 @@ def t2MOVsrl_glue : T2TwoRegShiftImm<
   let Inst{14-12} = 0b000;
   let Inst{7-6} = 0b01;
 }
-def t2MOVsra_glue : T2TwoRegShiftImm<
-                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
-                        "asrs", ".w\t$Rd, $Rm, #1",
-                        [(set rGPR:$Rd, (ARMsra_glue rGPR:$Rm))]>,
-                        Sched<[WriteALU]> {
+def t2ASRs1 : T2TwoRegShiftImm<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                               "asrs", ".w\t$Rd, $Rm, #1",
+                               [(set rGPR:$Rd, CPSR, (ARMasrs1 rGPR:$Rm))]>,
+              Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 5b49f728ebb8d..a29753909ea99 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -10,7 +10,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_CMPFP : SDTypeProfile<1, 2, [
+  SDTCisVT<0, FlagsVT>, // out flags
+  SDTCisFP<1>,          // lhs
+  SDTCisSameAs<2, 1>    // rhs
+]>;
+
+def SDT_CMPFP0 : SDTypeProfile<1, 1, [
+  SDTCisVT<0, FlagsVT>, // out flags
+  SDTCisFP<1>           // operand
+]>;
+
 def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
                                        SDTCisSameAs<1, 2>]>;
 def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
@@ -18,11 +28,18 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
 
 def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
 
-def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
-def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
-def arm_cmpfpe : SDNode<"ARMISD::CMPFPE",  SDT_ARMCmp, [SDNPOutGlue]>;
-def arm_cmpfpe0: SDNode<"ARMISD::CMPFPEw0",SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_cmpfp   : SDNode<"ARMISD::CMPFP",    SDT_CMPFP>;
+def arm_cmpfp0  : SDNode<"ARMISD::CMPFPw0",  SDT_CMPFP0>;
+def arm_cmpfpe  : SDNode<"ARMISD::CMPFPE",   SDT_CMPFP>;
+def arm_cmpfpe0 : SDNode<"ARMISD::CMPFPEw0", SDT_CMPFP0>;
+
+def arm_fmstat : SDNode<"ARMISD::FMSTAT",
+  SDTypeProfile<0, 1, [
+    SDTCisVT<0, FlagsVT> // in flags
+  ]>,
+  [SDNPOutGlue] // TODO: Change Glue to a normal result.
+>;
+
 def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
 def arm_fmrrd  : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
 def arm_vmovsr  : SDNode<"ARMISD::VMOVSR", SDT_VMOVSR>;
@@ -606,12 +623,12 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", "",
-                  [(arm_cmpfpe DPR:$Dd, (f64 DPR:$Dm))]>;
+                  [(set FPSCR_NZCV, (arm_cmpfpe DPR:$Dd, (f64 DPR:$Dm)))]>;
 
 def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", "",
-                  [(arm_cmpfpe SPR:$Sd, SPR:$Sm)]> {
+                  [(set FPSCR_NZCV, (arm_cmpfpe SPR:$Sd, SPR:$Sm))]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -620,17 +637,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
 def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins HPR:$Sd, HPR:$Sm),
                   IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
-                  [(arm_cmpfpe (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
+                  [(set FPSCR_NZCV, (arm_cmpfpe (f16 HPR:$Sd), (f16 HPR:$Sm)))]>;
 
 def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", "",
-                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+                  [(set FPSCR_NZCV, (arm_cmpfp DPR:$Dd, (f64 DPR:$Dm)))]>;
 
 def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", "",
-                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+                  [(set FPSCR_NZCV, (arm_cmpfp SPR:$Sd, SPR:$Sm))]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -639,7 +656,7 @@ def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
 def VCMPH  : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins HPR:$Sd, HPR:$Sm),
                   IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
-                  [(arm_cmpfp (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
+                  [(set FPSCR_NZCV, (arm_cmpfp (f16 HPR:$Sd), (f16 HPR:$Sm)))]>;
 } // Defs = [FPSCR_NZCV]
 
 //===----------------------------------------------------------------------===//
@@ -669,7 +686,7 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", "",
-                   [(arm_cmpfpe0 (f64 DPR:$Dd))]> {
+                   [(set FPSCR_NZCV, (arm_cmpfpe0 (f64 DPR:$Dd)))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -677,7 +694,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", "",
-                   [(arm_cmpfpe0 SPR:$Sd)]> {
+                   [(set FPSCR_NZCV, (arm_cmpfpe0 SPR:$Sd))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -689,7 +706,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins HPR:$Sd),
                    IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
-                   [(arm_cmpfpe0 (f16 HPR:$Sd))]> {
+                   [(set FPSCR_NZCV, (arm_cmpfpe0 (f16 HPR:$Sd)))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -697,7 +714,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", "",
-                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+                   [(set FPSCR_NZCV, (arm_cmpfp0 (f64 DPR:$Dd)))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -705,7 +722,7 @@ def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
 def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", "",
-                   [(arm_cmpfp0 SPR:$Sd)]> {
+                   [(set FPSCR_NZCV, (arm_cmpfp0 SPR:$Sd))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -717,7 +734,7 @@ def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
 def VCMPZH  : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins HPR:$Sd),
                    IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
-                   [(arm_cmpfp0 (f16 HPR:$Sd))]> {
+                   [(set FPSCR_NZCV, (arm_cmpfp0 (f16 HPR:$Sd)))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -2492,7 +2509,8 @@ let DecoderMethod = "DecodeForVMRSandVMSR" in {
  let Defs = [CPSR], Uses = [FPSCR_NZCV], Predicates = [HasFPRegs],
      Rt = 0b1111 /* apsr_nzcv */ in
  def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
-                         "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+                         "vmrs", "\tAPSR_nzcv, fpscr",
+                         [(arm_fmstat FPSCR_NZCV)]>;
 
  // Application level FPSCR -> GPR
  let hasSideEffects = 1, Uses = [FPSCR], Predicates = [HasFPRegs] in
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 54bf5fffd3942..e330d83cd80d5 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -81,7 +81,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// Some may be spilled after the stack has been realigned.
   unsigned GPRCS1Offset = 0;
   unsigned GPRCS2Offset = 0;
-  unsigned DPRCSOffset = 0;
+  unsigned DPRCS1Offset = 0;
 
   /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
   /// areas.
@@ -90,7 +90,8 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   unsigned GPRCS1Size = 0;
   unsigned GPRCS2Size = 0;
   unsigned DPRCSAlignGapSize = 0;
-  unsigned DPRCSSize = 0;
+  unsigned DPRCS1Size = 0;
+  unsigned GPRCS3Size = 0;
 
   /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
   /// the aligned portion of the stack frame.  This is always a contiguous
@@ -194,25 +195,27 @@ class ARMFunctionInfo : public MachineFunctionInfo {
 
   unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
   unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
-  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+  unsigned getDPRCalleeSavedArea1Offset() const { return DPRCS1Offset; }
 
   void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
   void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
-  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+  void setDPRCalleeSavedArea1Offset(unsigned o) { DPRCS1Offset = o; }
 
   unsigned getFPCXTSaveAreaSize() const       { return FPCXTSaveSize; }
   unsigned getFrameRecordSavedAreaSize() const { return FRSaveSize; }
   unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
   unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
   unsigned getDPRCalleeSavedGapSize() const   { return DPRCSAlignGapSize; }
-  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+  unsigned getDPRCalleeSavedArea1Size() const { return DPRCS1Size; }
+  unsigned getGPRCalleeSavedArea3Size() const { return GPRCS3Size; }
 
   void setFPCXTSaveAreaSize(unsigned s)       { FPCXTSaveSize = s; }
   void setFrameRecordSavedAreaSize(unsigned s) { FRSaveSize = s; }
   void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
   void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
   void setDPRCalleeSavedGapSize(unsigned s)   { DPRCSAlignGapSize = s; }
-  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+  void setDPRCalleeSavedArea1Size(unsigned s) { DPRCS1Size = s; }
+  void setGPRCalleeSavedArea3Size(unsigned s) { GPRCS3Size = s; }
 
   unsigned getArgumentStackSize() const { return ArgumentStackSize; }
   void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index f37d0fe542b4f..f5a675e2976bb 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -413,7 +413,9 @@ def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1, v2i1], 32, (add VPR)> {
 
 // FPSCR, when the flags at the top of it are used as the input or
 // output to an instruction such as MVE VADC.
-def cl_FPSCR_NZCV : RegisterClass<"ARM", [i32], 32, (add FPSCR_NZCV)>;
+def cl_FPSCR_NZCV : RegisterClass<"ARM", [i32], 32, (add FPSCR_NZCV)> {
+  let CopyCost = -1;
+}
 
 // Scalar single precision floating point register class..
 // FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
diff --git a/llvm/lib/Target/ARM/ARMScheduleM7.td b/llvm/lib/Target/ARM/ARMScheduleM7.td
index 25bc8401ca84a..99d2e4a832220 100644
--- a/llvm/lib/Target/ARM/ARMScheduleM7.td
+++ b/llvm/lib/Target/ARM/ARMScheduleM7.td
@@ -325,7 +325,7 @@ def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
 def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
              (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
                         "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
-                        "t2MOVsr(a|l)")>;
+                        "t2(A|L)SRs1$")>;
 def : InstRW<[WriteALUsi, M7Read_ISS],
              (instregex "t2MVNs")>;
 
@@ -335,7 +335,7 @@ def : InstRW<[WriteALUsi, M7Read_ISS],
 // but the results prove to be better than trying to get them exact.
 
 def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
-def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
+def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)r", "tROR")>;
 
 // Instructions that use the shifter, but have normal timing.
 
diff --git a/llvm/lib/Target/ARM/ARMScheduleM85.td b/llvm/lib/Target/ARM/ARMScheduleM85.td
index cd375a16305ec..e9938d857e6af 100644
--- a/llvm/lib/Target/ARM/ARMScheduleM85.td
+++ b/llvm/lib/Target/ARM/ARMScheduleM85.td
@@ -436,7 +436,7 @@ def : InstRW<[M85WriteALUsi, M85ReadALUsi],
 def : InstRW<[M85WriteShift2],
                (instregex "t2RRX$")>;
 def : InstRW<[WriteALU],
-               (instregex "(t|t2)(LSL|LSR|ASR|ROR|SBFX|UBFX)", "t2MOVsr(a|l)")>;
+             (instregex "(t|t2)(LSL|LSR|ASR|ROR|SBFX|UBFX)")>;
 
 // Instructions that use the shifter, but have normal timing
 
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 0dc637fc08aca..bf62849fba0c3 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -9053,10 +9053,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.setOpcode(Inst.getOpcode() == ARM::t2LDR_PRE_imm ? ARM::t2LDR_PRE
                                                              : ARM::t2LDR_POST);
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9066,11 +9067,12 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     MCInst TmpInst;
     TmpInst.setOpcode(Inst.getOpcode() == ARM::t2STR_PRE_imm ? ARM::t2STR_PRE
                                                              : ARM::t2STR_POST);
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9092,10 +9094,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
                           ? ARM::t2LDRB_PRE
                           : ARM::t2LDRB_POST);
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9116,11 +9119,12 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.setOpcode(Inst.getOpcode() == ARM::t2STRB_PRE_imm
                           ? ARM::t2STRB_PRE
                           : ARM::t2STRB_POST);
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9142,10 +9146,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
                           ? ARM::t2LDRH_PRE
                           : ARM::t2LDRH_POST);
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9166,11 +9171,12 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     TmpInst.setOpcode(Inst.getOpcode() == ARM::t2STRH_PRE_imm
                           ? ARM::t2STRH_PRE
                           : ARM::t2STRH_POST);
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9192,10 +9198,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
                           ? ARM::t2LDRSB_PRE
                           : ARM::t2LDRSB_POST);
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
@@ -9217,10 +9224,11 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
                           ? ARM::t2LDRSH_PRE
                           : ARM::t2LDRSH_POST);
     TmpInst.addOperand(Inst.getOperand(0)); // Rt
-    TmpInst.addOperand(Inst.getOperand(4)); // Rt_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
     TmpInst.addOperand(Inst.getOperand(2)); // imm
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
     Inst = TmpInst;
     return true;
   }
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 0a6bb5d1b9b16..890c2344c2431 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -290,7 +290,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
     AFI->setFrameRecordSavedAreaSize(FRSize);
   AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
   AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
-  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  AFI->setDPRCalleeSavedArea1Offset(DPRCSOffset);
   NumBytes = DPRCSOffset;
 
   int FramePtrOffsetInBlock = 0;
@@ -440,7 +440,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
 
   AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
   AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
-  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+  AFI->setDPRCalleeSavedArea1Size(DPRCSSize);
 
   if (RegInfo->hasStackRealignment(MF)) {
     const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
@@ -526,11 +526,10 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
     }
 
     // Move SP to start of FP callee save spill area.
-    NumBytes -= (AFI->getFrameRecordSavedAreaSize() +
-                 AFI->getGPRCalleeSavedArea1Size() +
-                 AFI->getGPRCalleeSavedArea2Size() +
-                 AFI->getDPRCalleeSavedAreaSize() +
-                 ArgRegsSaveSize);
+    NumBytes -=
+        (AFI->getFrameRecordSavedAreaSize() +
+         AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() +
+         AFI->getDPRCalleeSavedArea1Size() + ArgRegsSaveSize);
 
     // We are likely to need a scratch register and we know all callee-save
     // registers are free at this point in the epilogue, so pick one.
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index dec3a9b4a8264..e4a3bc76eeacd 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -402,7 +402,7 @@ static bool flattenArrays(Module &M) {
   DenseMap<GlobalVariable *, GlobalVariable *> GlobalMap;
   flattenGlobalArrays(M, GlobalMap);
   for (auto &F : make_early_inc_range(M.functions())) {
-    if (F.isIntrinsic())
+    if (F.isDeclaration())
       continue;
     MadeChange |= Impl.visit(F);
   }
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index b4b19caed8999..96eb8b1b0528a 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/LoongArchBaseInfo.h"
 #include "MCTargetDesc/LoongArchInstPrinter.h"
 #include "MCTargetDesc/LoongArchMCExpr.h"
 #include "MCTargetDesc/LoongArchMCTargetDesc.h"
@@ -1560,12 +1561,14 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
 
 unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &MCID = MII.get(Opc);
   switch (Opc) {
   default:
-    if (Opc >= LoongArch::AMADD_D && Opc <= LoongArch::AMXOR_W) {
+    if (LoongArchII::isSubjectToAMORdConstraint(MCID.TSFlags)) {
+      const bool IsAMCAS = LoongArchII::isAMCAS(MCID.TSFlags);
       MCRegister Rd = Inst.getOperand(0).getReg();
-      MCRegister Rk = Inst.getOperand(1).getReg();
-      MCRegister Rj = Inst.getOperand(2).getReg();
+      MCRegister Rk = Inst.getOperand(IsAMCAS ? 2 : 1).getReg();
+      MCRegister Rj = Inst.getOperand(IsAMCAS ? 3 : 2).getReg();
       if ((Rd == Rk || Rd == Rj) && Rd != LoongArch::R0)
         return Match_RequiresAMORdDifferRkRj;
     }
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
index 6ffc8823baee0..eee297d2e2d91 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td
@@ -32,6 +32,14 @@ class LAInst<dag outs, dag ins, string opcstr, string opnstr,
   let InOperandList = ins;
   let AsmString = opcstr # "\t" # opnstr;
   let Pattern = pattern;
+
+  // Target-specific instruction info and defaults
+
+  bit IsSubjectToAMORdConstraint = 0;
+  let TSFlags{0} = IsSubjectToAMORdConstraint;
+
+  bit IsAMCAS = 0;
+  let TSFlags{1} = IsAMCAS;
 }
 
 // Pseudo instructions
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 3de20d6e599db..cd1500229f4aa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -710,10 +710,22 @@ class STORE_2RI14<bits<32> op>
                "$rd, $rj, $imm14">;
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 1
 
-let hasSideEffects = 0, mayLoad = 1, mayStore = 1, Constraints = "@earlyclobber $rd" in
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1,
+    IsSubjectToAMORdConstraint = 1 in {
 class AM_3R<bits<32> op>
     : Fmt3R<op, (outs GPR:$rd), (ins GPR:$rk, GPRMemAtomic:$rj),
-            "$rd, $rk, $rj">;
+            "$rd, $rk, $rj"> {
+  let Constraints = "@earlyclobber $rd";
+}
+
+class AMCAS_3R<bits<32> op>
+    : Fmt3R<op, (outs GPR:$rd_wb), (ins GPR:$rd, GPR:$rk, GPRMemAtomic:$rj),
+            "$rd, $rk, $rj"> {
+  let Constraints = "@earlyclobber $rd_wb, $rd_wb = $rd";
+  let IsAMCAS = 1;
+}
+} // hasSideEffects = 0, mayLoad = 1, mayStore = 1,
+  // IsSubjectToAMORdConstraint = 1
 
 let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
 class LLBase<bits<32> op>
@@ -1024,14 +1036,14 @@ def AMMAX__DB_WU : AM_3R<0x38700000>;
 def AMMAX__DB_DU : AM_3R<0x38708000>;
 def AMMIN__DB_WU : AM_3R<0x38710000>;
 def AMMIN__DB_DU : AM_3R<0x38718000>;
-def AMCAS_B     : AM_3R<0x38580000>;
-def AMCAS_H     : AM_3R<0x38588000>;
-def AMCAS_W     : AM_3R<0x38590000>;
-def AMCAS_D     : AM_3R<0x38598000>;
-def AMCAS__DB_B     : AM_3R<0x385a0000>;
-def AMCAS__DB_H     : AM_3R<0x385a8000>;
-def AMCAS__DB_W     : AM_3R<0x385b0000>;
-def AMCAS__DB_D     : AM_3R<0x385b8000>;
+def AMCAS_B      : AMCAS_3R<0x38580000>;
+def AMCAS_H      : AMCAS_3R<0x38588000>;
+def AMCAS_W      : AMCAS_3R<0x38590000>;
+def AMCAS_D      : AMCAS_3R<0x38598000>;
+def AMCAS__DB_B  : AMCAS_3R<0x385a0000>;
+def AMCAS__DB_H  : AMCAS_3R<0x385a8000>;
+def AMCAS__DB_W  : AMCAS_3R<0x385b0000>;
+def AMCAS__DB_D  : AMCAS_3R<0x385b8000>;
 def LL_D : LLBase<0x22000000>;
 def SC_D : SCBase<0x23000000>;
 def SC_Q : SCBase_128<0x38570000>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 250896cbbe5f7..1a267b3e42a30 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -74,8 +74,7 @@ class VecCond<SDPatternOperator OpNode, ValueType TyNode,
   let usesCustomInserter = 1;
 }
 
-def vsplat_imm_eq_1 : PatFrags<(ops), [(build_vector),
-                                       (bitconvert (v4i32 (build_vector)))], [{
+def vsplat_imm_eq_1 : PatFrags<(ops), [(build_vector)], [{
   APInt Imm;
   EVT EltTy = N->getValueType(0).getVectorElementType();
 
@@ -116,8 +115,7 @@ def vsplati32_imm_eq_31 : PatFrags<(ops), [(build_vector)], [{
   return selectVSplat(N, Imm, EltTy.getSizeInBits()) &&
          Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 31;
 }]>;
-def vsplati64_imm_eq_63 : PatFrags<(ops), [(build_vector),
-                                           (bitconvert (v4i32 (build_vector)))], [{
+def vsplati64_imm_eq_63 : PatFrags<(ops), [(build_vector)], [{
   APInt Imm;
   EVT EltTy = N->getValueType(0).getVectorElementType();
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index 1a12fb492a60f..bd63c5edeabca 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -56,6 +56,36 @@ enum {
   MO_DESC_CALL,
   // TODO: Add more flags.
 };
+
+// Target-specific flags of LAInst.
+// All definitions must match LoongArchInstrFormats.td.
+enum {
+  // Whether the instruction's rd is normally required to differ from rj and
+  // rk, in the way the 3-register atomic memory operations behave
+  // (Sections 2.2.7.1 and 2.2.7.2, LoongArch Reference Manual Volume 1 v1.10;
+  // while Section 2.2.7.3 lacks a similar description for the AMCAS
+  // instructions, at least the INE exception is still signaled on Loongson
+  // 3A6000 when its rd == rj).
+  //
+  // Used for generating diagnostics for assembler input that violates the
+  // constraint. As described in the manual, the covered instructions require
+  // rd != rj && rd != rk to work as intended.
+  IsSubjectToAMORdConstraintShift = 0,
+  IsSubjectToAMORdConstraintMask = 1 << IsSubjectToAMORdConstraintShift,
+
+  // Whether the instruction belongs to the AMCAS family.
+  IsAMCASShift = IsSubjectToAMORdConstraintShift + 1,
+  IsAMCASMask = 1 << IsAMCASShift,
+};
+
+/// \returns true if this instruction's rd is normally required to differ
+/// from rj and rk, in the way 3-register atomic memory operations behave.
+static inline bool isSubjectToAMORdConstraint(uint64_t TSFlags) {
+  return TSFlags & IsSubjectToAMORdConstraintMask;
+}
+
+/// \returns true if this instruction belongs to the AMCAS family.
+static inline bool isAMCAS(uint64_t TSFlags) { return TSFlags & IsAMCASMask; }
 } // end namespace LoongArchII
 
 namespace LoongArchABI {
diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td
index 866161bf50638..25384a3fe8de3 100644
--- a/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -20,8 +20,8 @@ class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
 
 /// Match if the original argument (before lowering) was a float.
 /// For example, this is true for i32's that were lowered from soft-float.
-class CCIfOrigArgWasNotFloat<CCAction A>
-    : CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
+class CCIfOrigArgWasFloat<CCAction A>
+    : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
            A>;
 
 /// Match if the original argument (before lowering) was a 128-bit float (i.e.
@@ -134,12 +134,13 @@ def CC_MipsN : CallingConv<[
       CCIfSubtargetNot<"isLittle()",
           CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
 
-  // All integers (except soft-float integers) are promoted to 64-bit.
-  CCIfType<[i8, i16, i32], CCIfOrigArgWasNotFloat<CCPromoteToType<i64>>>,
+  // First deal with f32s that got turned into i32s due to soft-float.
+  CCIfType<[i32],
+      CCIfSubtarget<"useSoftFloat()",
+          CCIfOrigArgWasFloat<CCDelegateTo<CC_MipsN_SoftFloat>>>>,
 
-  // The only i32's we have left are soft-float arguments.
-  CCIfSubtarget<"useSoftFloat()", CCIfType<[i32],
-                CCDelegateTo<CC_MipsN_SoftFloat>>>,
+  // All remaining integers are promoted to 64-bit.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
 
   // Integer arguments are passed in integer registers.
   CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index afb027a533d5a..c3e21e0ff7a0f 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -614,11 +614,9 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
 
   if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
-    // Extract the run of set bits starting with bit zero from the bitwise
-    // inverse of ImmValue, and test that the inverse of this is the same
-    // as the original value.
-    if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
-
+    // Check if we have a leading one, then check if the whole value is a
+    // shifted mask.
+    if (ImmValue.isNegative() && ImmValue.isShiftedMask()) {
       Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy);
       return true;
     }
@@ -647,9 +645,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
 
   if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
-    // Extract the run of set bits starting with bit zero, and test that the
-    // result is the same as the original value
-    if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+    if (ImmValue.isMask()) {
       Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy);
       return true;
     }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4ad0200ca5cf8..e93430a27dc32 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -862,16 +862,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     setOperationAction(Op, MVT::bf16, Promote);
     AddPromotedToType(Op, MVT::bf16, MVT::f32);
   }
-  for (const auto &Op : {ISD::FABS}) {
-    setOperationAction(Op, MVT::f16, Promote);
-    setOperationAction(Op, MVT::f32, Legal);
-    setOperationAction(Op, MVT::f64, Legal);
-    setOperationAction(Op, MVT::v2f16, Expand);
-    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
-    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
-    if (getOperationAction(Op, MVT::bf16) == Promote)
-      AddPromotedToType(Op, MVT::bf16, MVT::f32);
+
+  setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
+  if (STI.getPTXVersion() >= 65) {
+    setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
+    setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
+  } else {
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::v2f16, Expand);
   }
+  setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
+  setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
+  if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
+    AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
 
   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
     setOperationAction(Op, MVT::f32, Legal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 4661c059d5f78..b4dbe6a0930ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -199,3 +199,23 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
   BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
   return 2;
 }
+
+bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                          const MachineBasicBlock *MBB,
+                                          const MachineFunction &MF) const {
+  // The MachineInstrs listed below make up a PTX function call sequence and
+  // must stay together in the order they were emitted, so each one is
+  // reported as a boundary to keep the scheduler from reordering or splitting.
+  switch (MI.getOpcode()) {
+  default:
+    return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+  case NVPTX::CallUniPrintCallRetInst1:
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgParam:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::CallArgEndInst1:
+    return true;
+  }
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index f674a00bc351b..a1d9f01712018 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -67,6 +67,9 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4d46afb8c4ef9..b843bb5ae4310 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -481,6 +481,12 @@ struct RISCVOperand final : public MCParsedAsmOperand {
            RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum);
   }
 
+  bool isGPRPair() const {
+    // Only a register operand can belong to the GPR pair register class.
+    const auto &PairRC = RISCVMCRegisterClasses[RISCV::GPRPairRegClassID];
+    return Kind == KindTy::Register && PairRC.contains(Reg.RegNum);
+  }
+
   bool isGPRF16() const {
     return Kind == KindTy::Register &&
            RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum);
@@ -494,13 +500,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
   bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
   bool isGPRAsFPR16() const { return isGPRF16() && Reg.IsGPRAsFPR; }
   bool isGPRAsFPR32() const { return isGPRF32() && Reg.IsGPRAsFPR; }
-  bool isGPRPairAsFPR() const { return isGPRPair() && Reg.IsGPRAsFPR; }
-
-  bool isGPRPair() const {
-    return Kind == KindTy::Register &&
-           RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains(
-               Reg.RegNum);
-  }
+  bool isGPRPairAsFPR64() const { return isGPRPair() && Reg.IsGPRAsFPR; }
 
   static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
                                   RISCVMCExpr::VariantKind &VK) {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 3c95f01b86361..6ddc447da1a56 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -581,8 +581,6 @@ static void getOperandsForBranch(Register CondReg, RISCVCC::CondCode &CC,
 }
 
 bool RISCVInstructionSelector::select(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction &MF = *MBB.getParent();
   MachineIRBuilder MIB(MI);
 
   preISelLower(MI, MIB);
@@ -703,58 +701,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
     MI.eraseFromParent();
     return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
   }
-  case TargetOpcode::G_BRJT: {
-    // FIXME: Move to legalization?
-    const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
-    unsigned EntrySize = MJTI->getEntrySize(MF.getDataLayout());
-    assert((EntrySize == 4 || (Subtarget->is64Bit() && EntrySize == 8)) &&
-           "Unsupported jump-table entry size");
-    assert(
-        (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 ||
-         MJTI->getEntryKind() == MachineJumpTableInfo::EK_Custom32 ||
-         MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress) &&
-        "Unexpected jump-table entry kind");
-
-    auto SLL =
-        MIB.buildInstr(RISCV::SLLI, {&RISCV::GPRRegClass}, {MI.getOperand(2)})
-            .addImm(Log2_32(EntrySize));
-    if (!SLL.constrainAllUses(TII, TRI, RBI))
-      return false;
-
-    // TODO: Use SHXADD. Moving to legalization would fix this automatically.
-    auto ADD = MIB.buildInstr(RISCV::ADD, {&RISCV::GPRRegClass},
-                              {MI.getOperand(0), SLL.getReg(0)});
-    if (!ADD.constrainAllUses(TII, TRI, RBI))
-      return false;
-
-    unsigned LdOpc = EntrySize == 8 ? RISCV::LD : RISCV::LW;
-    auto Dest =
-        MIB.buildInstr(LdOpc, {&RISCV::GPRRegClass}, {ADD.getReg(0)})
-            .addImm(0)
-            .addMemOperand(MF.getMachineMemOperand(
-                MachinePointerInfo::getJumpTable(MF), MachineMemOperand::MOLoad,
-                EntrySize, Align(MJTI->getEntryAlignment(MF.getDataLayout()))));
-    if (!Dest.constrainAllUses(TII, TRI, RBI))
-      return false;
-
-    // If the Kind is EK_LabelDifference32, the table stores an offset from
-    // the location of the table. Add the table address to get an absolute
-    // address.
-    if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32) {
-      Dest = MIB.buildInstr(RISCV::ADD, {&RISCV::GPRRegClass},
-                            {Dest.getReg(0), MI.getOperand(0)});
-      if (!Dest.constrainAllUses(TII, TRI, RBI))
-        return false;
-    }
-
-    auto Branch =
-        MIB.buildInstr(RISCV::PseudoBRIND, {}, {Dest.getReg(0)}).addImm(0);
-    if (!Branch.constrainAllUses(TII, TRI, RBI))
-      return false;
-
-    MI.eraseFromParent();
-    return true;
-  }
   case TargetOpcode::G_BRINDIRECT:
     MI.setDesc(TII.get(RISCV::PseudoBRIND));
     MI.addOperand(MachineOperand::CreateImm(0));
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index f8a13433a1484..b557659ae0765 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
@@ -282,14 +283,16 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   };
 
   LoadActions.legalForTypesWithMemDesc(
-      {{s16, p0, s16, getScalarMemAlign(16)},
+      {{s16, p0, s8, getScalarMemAlign(8)},
        {s32, p0, s8, getScalarMemAlign(8)},
+       {s16, p0, s16, getScalarMemAlign(16)},
        {s32, p0, s16, getScalarMemAlign(16)},
        {s32, p0, s32, getScalarMemAlign(32)},
        {p0, p0, sXLen, getScalarMemAlign(XLen)}});
   StoreActions.legalForTypesWithMemDesc(
-      {{s16, p0, s16, getScalarMemAlign(16)},
+      {{s16, p0, s8, getScalarMemAlign(8)},
        {s32, p0, s8, getScalarMemAlign(8)},
+       {s16, p0, s16, getScalarMemAlign(16)},
        {s32, p0, s16, getScalarMemAlign(16)},
        {s32, p0, s32, getScalarMemAlign(32)},
        {p0, p0, sXLen, getScalarMemAlign(XLen)}});
@@ -383,10 +386,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   LoadActions.widenScalarToNextPow2(0, /* MinSize = */ 8)
       .lowerIfMemSizeNotByteSizePow2()
-      .clampScalar(0, s32, sXLen)
+      .clampScalar(0, s16, sXLen)
       .lower();
   StoreActions
-      .clampScalar(0, s32, sXLen)
+      .clampScalar(0, s16, sXLen)
       .lowerIfMemSizeNotByteSizePow2()
       .lower();
 
@@ -404,7 +407,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   getActionDefinitionsBuilder(G_BRCOND).legalFor({sXLen}).minScalar(0, sXLen);
 
-  getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, sXLen}});
+  getActionDefinitionsBuilder(G_BRJT).customFor({{p0, sXLen}});
 
   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
 
@@ -685,6 +688,61 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,
   return true;
 }
 
+bool RISCVLegalizerInfo::legalizeBRJT(MachineInstr &MI,
+                                      MachineIRBuilder &MIRBuilder) const {
+  auto &MF = *MI.getParent()->getParent();
+  const MachineJumpTableInfo *JTInfo = MF.getJumpTableInfo();
+  const unsigned EntryBytes = JTInfo->getEntrySize(MF.getDataLayout());
+
+  // The index is scaled with a shift, so only power-of-two entries work.
+  if (!isPowerOf2_32(EntryBytes))
+    return false;
+
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  const Register TableReg = MI.getOperand(0).getReg();
+  const Register IndexReg = MI.getOperand(2).getReg();
+  const LLT TableTy = MRI.getType(TableReg);
+  const LLT IndexTy = MRI.getType(IndexReg);
+
+  // Entry address = table base + (index << log2(entry size)).
+  auto Scale = MIRBuilder.buildConstant(IndexTy, Log2_32(EntryBytes));
+  auto Offset = MIRBuilder.buildShl(IndexTy, IndexReg, Scale);
+  auto EntryAddr = MIRBuilder.buildPtrAdd(TableTy, TableReg, Offset);
+
+  MachineMemOperand *EntryMMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getJumpTable(MF), MachineMemOperand::MOLoad,
+      EntryBytes, Align(JTInfo->getEntryAlignment(MF.getDataLayout())));
+
+  Register TargetReg;
+  switch (JTInfo->getEntryKind()) {
+  default:
+    return false;
+  case MachineJumpTableInfo::EK_LabelDifference32: {
+    // For PIC: BRIND(load(JumpTable + index) + RelocBase). RelocBase may be
+    // the jump table, the GOT or another global base; the table is used here.
+    const bool Is64 = STI.is64Bit();
+    auto Entry = MIRBuilder.buildLoadInstr(
+        Is64 ? TargetOpcode::G_SEXTLOAD : TargetOpcode::G_LOAD, IndexTy,
+        EntryAddr, *EntryMMO);
+    TargetReg = MIRBuilder.buildPtrAdd(TableTy, TableReg, Entry).getReg(0);
+    break;
+  }
+  case MachineJumpTableInfo::EK_Custom32: {
+    auto Entry = MIRBuilder.buildLoadInstr(TargetOpcode::G_SEXTLOAD, IndexTy,
+                                           EntryAddr, *EntryMMO);
+    TargetReg = MIRBuilder.buildIntToPtr(TableTy, Entry).getReg(0);
+    break;
+  }
+  case MachineJumpTableInfo::EK_BlockAddress:
+    TargetReg = MIRBuilder.buildLoad(TableTy, EntryAddr, *EntryMMO).getReg(0);
+    break;
+  }
+
+  MIRBuilder.buildBrIndirect(TargetReg);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool RISCVLegalizerInfo::shouldBeInConstantPool(const APInt &APImm,
                                                 bool ShouldOptForSize) const {
   assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64);
@@ -1311,6 +1369,8 @@ bool RISCVLegalizerInfo::legalizeCustom(
     MI.eraseFromParent();
     return true;
   }
+  case TargetOpcode::G_BRJT:
+    return legalizeBRJT(MI, MIRBuilder);
   case TargetOpcode::G_VASTART:
     return legalizeVAStart(MI, MIRBuilder);
   case TargetOpcode::G_VSCALE:
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index ae595cf6d737f..4451866745194 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -42,6 +42,7 @@ class RISCVLegalizerInfo : public LegalizerInfo {
   bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                            GISelChangeObserver &Observer) const;
 
+  bool legalizeBRJT(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
   bool legalizeVAStart(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
   bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
   bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index 8d1196c4765d4..9fd4400b97b23 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -174,6 +174,9 @@ def : StPat<store, SD, GPR, PtrVT>;
 def : LdPat<load, LH, i16>;
 def : StPat<store, SH, GPR, i16>;
 
+def : LdPat<extloadi8, LBU, i16>; // Prefer unsigned due to no c.lb in Zcb.
+def : StPat<truncstorei8, SB, GPR, i16>;
+
 //===----------------------------------------------------------------------===//
 // RV64 i32 patterns not used by SelectionDAG
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index a1b74faf17fab..c5432619a3646 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -176,8 +176,7 @@ static SDValue selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
                             RISCVMatInt::InstSeq &Seq) {
   SDValue SrcReg = CurDAG->getRegister(RISCV::X0, VT);
   for (const RISCVMatInt::Inst &Inst : Seq) {
-    SDValue SDImm =
-        CurDAG->getSignedConstant(Inst.getImm(), DL, VT, /*isTarget=*/true);
+    SDValue SDImm = CurDAG->getSignedTargetConstant(Inst.getImm(), DL, VT);
     SDNode *Result = nullptr;
     switch (Inst.getOpndKind()) {
     case RISCVMatInt::Imm:
@@ -208,10 +207,10 @@ static SDValue selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
 
   // Use a rematerializable pseudo instruction for short sequences if enabled.
   if (Seq.size() == 2 && UsePseudoMovImm)
-    return SDValue(CurDAG->getMachineNode(RISCV::PseudoMovImm, DL, VT,
-                                          CurDAG->getSignedConstant(
-                                              Imm, DL, VT, /*isTarget=*/true)),
-                   0);
+    return SDValue(
+        CurDAG->getMachineNode(RISCV::PseudoMovImm, DL, VT,
+                               CurDAG->getSignedTargetConstant(Imm, DL, VT)),
+        0);
 
   // See if we can create this constant as (ADD (SLLI X, C), X) where X is at
   // worst an LUI+ADDIW. This will require an extra register, but avoids a
@@ -594,7 +593,7 @@ bool RISCVDAGToDAGISel::tryShrinkShlLogicImm(SDNode *Node) {
 
   SDNode *BinOp = CurDAG->getMachineNode(
       BinOpc, DL, VT, Shift.getOperand(0),
-      CurDAG->getSignedConstant(ShiftedVal, DL, VT, /*isTarget=*/true));
+      CurDAG->getSignedTargetConstant(ShiftedVal, DL, VT));
   SDNode *SLLI =
       CurDAG->getMachineNode(ShOpc, DL, VT, SDValue(BinOp, 0),
                              CurDAG->getTargetConstant(ShAmt, DL, VT));
@@ -723,11 +722,10 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) {
     return false;
 
   EVT Ty = Ld->getOffset().getValueType();
-  SDValue Ops[] = {Ld->getBasePtr(),
-                   CurDAG->getSignedConstant(Offset >> Shift, SDLoc(Node), Ty,
-                                             /*isTarget=*/true),
-                   CurDAG->getTargetConstant(Shift, SDLoc(Node), Ty),
-                   Ld->getChain()};
+  SDValue Ops[] = {
+      Ld->getBasePtr(),
+      CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(Node), Ty),
+      CurDAG->getTargetConstant(Shift, SDLoc(Node), Ty), Ld->getChain()};
   SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(Node), Ld->getValueType(0),
                                        Ld->getValueType(1), MVT::Other, Ops);
 
@@ -952,11 +950,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     ReplaceNode(Node, Res);
     return;
   }
+  case RISCVISD::BuildGPRPair:
   case RISCVISD::BuildPairF64: {
-    if (!Subtarget->hasStdExtZdinx())
+    if (Opcode == RISCVISD::BuildPairF64 && !Subtarget->hasStdExtZdinx())
       break;
 
-    assert(!Subtarget->is64Bit() && "Unexpected subtarget");
+    assert((!Subtarget->is64Bit() || Opcode == RISCVISD::BuildGPRPair) &&
+           "BuildPairF64 only handled here on rv32i_zdinx");
 
     SDValue Ops[] = {
         CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32),
@@ -965,24 +965,26 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
         Node->getOperand(1),
         CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)};
 
-    SDNode *N =
-        CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::f64, Ops);
+    SDNode *N = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
     ReplaceNode(Node, N);
     return;
   }
+  case RISCVISD::SplitGPRPair:
   case RISCVISD::SplitF64: {
-    if (Subtarget->hasStdExtZdinx()) {
-      assert(!Subtarget->is64Bit() && "Unexpected subtarget");
+    if (Subtarget->hasStdExtZdinx() || Opcode != RISCVISD::SplitF64) {
+      assert((!Subtarget->is64Bit() || Opcode == RISCVISD::SplitGPRPair) &&
+             "SplitF64 only handled here on rv32i_zdinx");
 
       if (!SDValue(Node, 0).use_empty()) {
-        SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL, VT,
+        SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL,
+                                                    Node->getValueType(0),
                                                     Node->getOperand(0));
         ReplaceUses(SDValue(Node, 0), Lo);
       }
 
       if (!SDValue(Node, 1).use_empty()) {
-        SDValue Hi = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_odd, DL, VT,
-                                                    Node->getOperand(0));
+        SDValue Hi = CurDAG->getTargetExtractSubreg(
+            RISCV::sub_gpr_odd, DL, Node->getValueType(1), Node->getOperand(0));
         ReplaceUses(SDValue(Node, 1), Hi);
       }
 
@@ -990,6 +992,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       return;
     }
 
+    assert(Opcode != RISCVISD::SplitGPRPair &&
+           "SplitGPRPair should already be handled");
+
     if (!Subtarget->hasStdExtZfa())
       break;
     assert(Subtarget->hasStdExtD() && !Subtarget->is64Bit() &&
@@ -2503,8 +2508,8 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base,
     if (isInt<12>(CVal)) {
       Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
                                          Subtarget->getXLenVT());
-      Offset = CurDAG->getSignedConstant(
-          CVal, SDLoc(Addr), Subtarget->getXLenVT(), /*isTarget=*/true);
+      Offset = CurDAG->getSignedTargetConstant(CVal, SDLoc(Addr),
+                                               Subtarget->getXLenVT());
       return true;
     }
   }
@@ -2543,7 +2548,7 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
     } else {
       Base = CurDAG->getRegister(RISCV::X0, VT);
     }
-    Offset = CurDAG->getSignedConstant(Lo12, DL, VT, /*isTarget=*/true);
+    Offset = CurDAG->getSignedTargetConstant(Lo12, DL, VT);
     return true;
   }
 
@@ -2565,7 +2570,7 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
   assert(!Seq.empty() && "Expected more instructions in sequence");
 
   Base = selectImmSeq(CurDAG, DL, VT, Seq);
-  Offset = CurDAG->getSignedConstant(Lo12, DL, VT, /*isTarget=*/true);
+  Offset = CurDAG->getSignedTargetConstant(Lo12, DL, VT);
   return true;
 }
 
@@ -2715,7 +2720,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
 
       if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
-      Offset = CurDAG->getSignedConstant(CVal, DL, VT, /*isTarget=*/true);
+      Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
       return true;
     }
   }
@@ -2732,11 +2737,10 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
     if (CVal >= -4096 && CVal <= (4094 - RV32ZdinxRange)) {
       int64_t Adj = CVal < 0 ? -2048 : 2047;
       Base = SDValue(
-          CurDAG->getMachineNode(
-              RISCV::ADDI, DL, VT, Addr.getOperand(0),
-              CurDAG->getSignedConstant(Adj, DL, VT, /*isTarget=*/true)),
+          CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
+                                 CurDAG->getSignedTargetConstant(Adj, DL, VT)),
           0);
-      Offset = CurDAG->getSignedConstant(CVal - Adj, DL, VT, /*isTarget=*/true);
+      Offset = CurDAG->getSignedTargetConstant(CVal - Adj, DL, VT);
       return true;
     }
 
@@ -2790,7 +2794,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
 
       if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
-      Offset = CurDAG->getSignedConstant(CVal, DL, VT, /*isTarget=*/true);
+      Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
       return true;
     }
   }
@@ -2806,12 +2810,12 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base,
     if ((-2049 >= CVal && CVal >= -4096) || (4065 >= CVal && CVal >= 2017)) {
       int64_t Adj = CVal < 0 ? -2048 : 2016;
       int64_t AdjustedOffset = CVal - Adj;
-      Base = SDValue(CurDAG->getMachineNode(
-                         RISCV::ADDI, DL, VT, Addr.getOperand(0),
-                         CurDAG->getSignedConstant(AdjustedOffset, DL, VT,
-                                                   /*isTarget=*/true)),
-                     0);
-      Offset = CurDAG->getSignedConstant(Adj, DL, VT, /*isTarget=*/true);
+      Base =
+          SDValue(CurDAG->getMachineNode(
+                      RISCV::ADDI, DL, VT, Addr.getOperand(0),
+                      CurDAG->getSignedTargetConstant(AdjustedOffset, DL, VT)),
+                  0);
+      Offset = CurDAG->getSignedTargetConstant(Adj, DL, VT);
       return true;
     }
 
@@ -2957,21 +2961,21 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal,
     // If the RHS is -2048, we can use xori to produce 0 if the LHS is -2048 and
     // non-zero otherwise.
     if (CVal == -2048) {
-      Val = SDValue(CurDAG->getMachineNode(
-                        RISCV::XORI, DL, N->getValueType(0), LHS,
-                        CurDAG->getSignedConstant(CVal, DL, N->getValueType(0),
-                                                  /*isTarget=*/true)),
-                    0);
+      Val = SDValue(
+          CurDAG->getMachineNode(
+              RISCV::XORI, DL, N->getValueType(0), LHS,
+              CurDAG->getSignedTargetConstant(CVal, DL, N->getValueType(0))),
+          0);
       return true;
     }
     // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the
     // LHS is equal to the RHS and non-zero otherwise.
     if (isInt<12>(CVal) || CVal == 2048) {
-      Val = SDValue(CurDAG->getMachineNode(
-                        RISCV::ADDI, DL, N->getValueType(0), LHS,
-                        CurDAG->getSignedConstant(-CVal, DL, N->getValueType(0),
-                                                  /*isTarget=*/true)),
-                    0);
+      Val = SDValue(
+          CurDAG->getMachineNode(
+              RISCV::ADDI, DL, N->getValueType(0), LHS,
+              CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))),
+          0);
       return true;
     }
     if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) {
@@ -3412,8 +3416,7 @@ bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5,
       return false;
 
     EVT Ty = N->getValueType(0);
-    Simm5 = CurDAG->getSignedConstant(Offset >> Shift, SDLoc(N), Ty,
-                                      /*isTarget=*/true);
+    Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), Ty);
     Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), Ty);
     return true;
   }
@@ -3430,16 +3433,16 @@ bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
                                    N->getValueType(0));
   } else if (C && C->isAllOnes()) {
     // Treat all ones as VLMax.
-    VL = CurDAG->getSignedConstant(RISCV::VLMaxSentinel, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+    VL = CurDAG->getSignedTargetConstant(RISCV::VLMaxSentinel, SDLoc(N),
+                                         N->getValueType(0));
   } else if (isa<RegisterSDNode>(N) &&
              cast<RegisterSDNode>(N)->getReg() == RISCV::X0) {
     // All our VL operands use an operand that allows GPRNoX0 or an immediate
     // as the register class. Convert X0 to a special immediate to pass the
     // MachineVerifier. This is recognized specially by the vsetvli insertion
     // pass.
-    VL = CurDAG->getSignedConstant(RISCV::VLMaxSentinel, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+    VL = CurDAG->getSignedTargetConstant(RISCV::VLMaxSentinel, SDLoc(N),
+                                         N->getValueType(0));
   } else {
     VL = N;
   }
@@ -3497,8 +3500,8 @@ static bool selectVSplatImmHelper(SDValue N, SDValue &SplatVal,
   if (!ValidateImm(SplatImm))
     return false;
 
-  SplatVal = DAG.getSignedConstant(SplatImm, SDLoc(N), Subtarget.getXLenVT(),
-                                   /*isTarget=*/true);
+  SplatVal =
+      DAG.getSignedTargetConstant(SplatImm, SDLoc(N), Subtarget.getXLenVT());
   return true;
 }
 
@@ -3598,8 +3601,8 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
     if (!isInt<5>(ImmVal))
       return false;
 
-    Imm = CurDAG->getSignedConstant(ImmVal, SDLoc(N), Subtarget->getXLenVT(),
-                                    /*isTarget=*/true);
+    Imm = CurDAG->getSignedTargetConstant(ImmVal, SDLoc(N),
+                                          Subtarget->getXLenVT());
     return true;
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5f970ffe671c6..976b2478b433e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2233,6 +2233,17 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
   return PartVT;
 }
 
+unsigned
+RISCVTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+                                     std::optional<MVT> RegisterVT) const {
+  // Inline-asm pair operand: a 2*XLen integer in one untyped register.
+  const MVT PairVT = Subtarget.is64Bit() ? MVT::i128 : MVT::i64;
+  const bool IsUntyped = RegisterVT && *RegisterVT == MVT::Untyped;
+  if (IsUntyped && VT == PairVT)
+    return 1;
+  return TargetLowering::getNumRegisters(Context, VT, RegisterVT);
+}
+
 unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                            CallingConv::ID CC,
                                                            EVT VT) const {
@@ -20196,6 +20207,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TAIL)
   NODE_NAME_CASE(SELECT_CC)
   NODE_NAME_CASE(BR_CC)
+  NODE_NAME_CASE(BuildGPRPair)
+  NODE_NAME_CASE(SplitGPRPair)
   NODE_NAME_CASE(BuildPairF64)
   NODE_NAME_CASE(SplitF64)
   NODE_NAME_CASE(ADD_LO)
@@ -20456,6 +20469,7 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
     default:
       break;
     case 'f':
+    case 'R':
       return C_RegisterClass;
     case 'I':
     case 'J':
@@ -20515,6 +20529,10 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
           return std::make_pair(0U, &RISCV::GPRNoX0RegClass);
       }
       break;
+    case 'R':
+      // All 'R' operands, including f64 on rv32 with Zdinx, are allocated from
+      // the same pair class, so the value type does not need to be inspected.
+      return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass);
     default:
       break;
     }
@@ -20766,8 +20784,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint(
       if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
         uint64_t CVal = C->getSExtValue();
         if (isInt<12>(CVal))
-          Ops.push_back(DAG.getSignedConstant(
-              CVal, SDLoc(Op), Subtarget.getXLenVT(), /*isTarget=*/true));
+          Ops.push_back(DAG.getSignedTargetConstant(CVal, SDLoc(Op),
+                                                    Subtarget.getXLenVT()));
       }
       return;
     case 'J':
@@ -21360,6 +21378,23 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
     unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
   bool IsABIRegCopy = CC.has_value();
   EVT ValueVT = Val.getValueType();
+
+  MVT PairVT = Subtarget.is64Bit() ? MVT::i128 : MVT::i64;
+  if ((ValueVT == PairVT ||
+       (!Subtarget.is64Bit() && Subtarget.hasStdExtZdinx() &&
+        ValueVT == MVT::f64)) &&
+      NumParts == 1 && PartVT == MVT::Untyped) {
+    // Pairs in Inline Assembly, f64 in Inline assembly on rv32_zdinx
+    MVT XLenVT = Subtarget.getXLenVT();
+    if (ValueVT == MVT::f64)
+      Val = DAG.getBitcast(MVT::i64, Val);
+    auto [Lo, Hi] = DAG.SplitScalar(Val, DL, XLenVT, XLenVT);
+    // Always creating an MVT::Untyped part, so always use
+    // RISCVISD::BuildGPRPair.
+    Parts[0] = DAG.getNode(RISCVISD::BuildGPRPair, DL, PartVT, Lo, Hi);
+    return true;
+  }
+
   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
       PartVT == MVT::f32) {
     // Cast the [b]f16 to i16, extend to i32, pad with ones to make a float
@@ -21368,7 +21403,7 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
     Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Val);
     Val = DAG.getNode(ISD::OR, DL, MVT::i32, Val,
                       DAG.getConstant(0xFFFF0000, DL, MVT::i32));
-    Val = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
     Parts[0] = Val;
     return true;
   }
@@ -21436,6 +21471,27 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
     SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
     MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
   bool IsABIRegCopy = CC.has_value();
+
+  MVT PairVT = Subtarget.is64Bit() ? MVT::i128 : MVT::i64;
+  if ((ValueVT == PairVT ||
+       (!Subtarget.is64Bit() && Subtarget.hasStdExtZdinx() &&
+        ValueVT == MVT::f64)) &&
+      NumParts == 1 && PartVT == MVT::Untyped) {
+    // Pairs in Inline Assembly, f64 in Inline assembly on rv32_zdinx
+    MVT XLenVT = Subtarget.getXLenVT();
+
+    SDValue Val = Parts[0];
+    // Always starting with an MVT::Untyped part, so always use
+    // RISCVISD::SplitGPRPair
+    Val = DAG.getNode(RISCVISD::SplitGPRPair, DL, DAG.getVTList(XLenVT, XLenVT),
+                      Val);
+    Val = DAG.getNode(ISD::BUILD_PAIR, DL, PairVT, Val.getValue(0),
+                      Val.getValue(1));
+    if (ValueVT == MVT::f64)
+      Val = DAG.getBitcast(ValueVT, Val);
+    return Val;
+  }
+
   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
       PartVT == MVT::f32) {
     SDValue Val = Parts[0];
@@ -22012,6 +22068,36 @@ SDValue RISCVTargetLowering::expandIndirectJTBranch(const SDLoc &dl,
   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
 }
 
+// If an output pattern produces multiple instructions, tablegen may pick an
+// arbitrary type from an instruction's destination register class to use for
+// the VT of that MachineSDNode. This VT may be used to look up the
+// representative register class. If the type isn't legal, the default
+// implementation will not find a register class.
+//
+// Some integer types smaller than XLen are listed in the GPR register class to
+// support isel patterns for GISel, but are not legal in SelectionDAG. The
+// arbitrary type tablegen picks may be one of these smaller types.
+//
+// f16 and bf16 are both valid for the FPR16 or GPRF16 register class. It's
+// possible for tablegen to pick bf16 as the arbitrary type for an f16 pattern.
+std::pair<const TargetRegisterClass *, uint8_t>
+RISCVTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                             MVT VT) const {
+  // Swap in a stand-in type for the cases below, then defer to the default
+  // implementation in TargetLowering.
+  const auto Ty = VT.SimpleTy;
+
+  // i8, i16 and i32 are looked up through the XLen type.
+  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32)
+    return TargetLowering::findRepresentativeClass(TRI, Subtarget.getXLenVT());
+
+  // bf16 and f16 are looked up through f32.
+  if (Ty == MVT::bf16 || Ty == MVT::f16)
+    return TargetLowering::findRepresentativeClass(TRI, MVT::f32);
+
+  return TargetLowering::findRepresentativeClass(TRI, VT);
+}
+
 namespace llvm::RISCVVIntrinsicsTable {
 
 #define GET_RISCVVIntrinsicsTable_IMPL
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9ae70d257fa44..7ada941563c1f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -44,6 +44,18 @@ enum NodeType : unsigned {
   SELECT_CC,
   BR_CC,
 
+  /// Turn a pair of `i<xlen>`s into an even-odd register pair (`untyped`).
+  /// - Output: `untyped` even-odd register pair
+  /// - Input 0: `i<xlen>` low-order bits, for even register.
+  /// - Input 1: `i<xlen>` high-order bits, for odd register.
+  BuildGPRPair,
+
+  /// Turn an even-odd register pair (`untyped`) into a pair of `i<xlen>`s.
+  /// - Output 0: `i<xlen>` low-order bits, from even register.
+  /// - Output 1: `i<xlen>` high-order bits, from odd register.
+  /// - Input: `untyped` even-odd register pair
+  SplitGPRPair,
+
   /// Turns a pair of `i32`s into an `f64`. Needed for rv32d/ilp32.
   /// - Output: `f64`.
   /// - Input 0: low-order bits (31-0) (as `i32`), for even register.
@@ -547,6 +559,11 @@ class RISCVTargetLowering : public TargetLowering {
   MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                     EVT VT) const override;
 
+  /// Return the number of registers for a given MVT, for inline assembly
+  unsigned
+  getNumRegisters(LLVMContext &Context, EVT VT,
+                  std::optional<MVT> RegisterVT = std::nullopt) const override;
+
   /// Return the number of registers for a given MVT, ensuring vectors are
   /// treated as a series of gpr sized integers.
   unsigned getNumRegistersForCallingConv(LLVMContext &Context,
@@ -1051,6 +1068,9 @@ class RISCVTargetLowering : public TargetLowering {
 
   SDValue emitFlushICache(SelectionDAG &DAG, SDValue InChain, SDValue Start,
                           SDValue End, SDValue Flags, SDLoc DL) const;
+
+  std::pair<const TargetRegisterClass *, uint8_t>
+  findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override;
 };
 
 namespace RISCVVIntrinsicsTable {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 1908f5e5dede8..5747f05ffafd4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -415,14 +415,14 @@ def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm">;
 
 // Return the negation of an immediate value.
 def NegImm : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(-N->getSExtValue(), SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(-N->getSExtValue(), SDLoc(N),
+                                         N->getValueType(0));
 }]>;
 
 // Return an immediate value minus 32.
 def ImmSub32 : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(N->getSExtValue() - 32, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(N->getSExtValue() - 32, SDLoc(N),
+                                         N->getValueType(0));
 }]>;
 
 // Return an immediate subtracted from XLen.
@@ -454,16 +454,15 @@ def AddiPair : PatLeaf<(imm), [{
 def AddiPairImmSmall : SDNodeXForm<imm, [{
   int64_t Imm = N->getSExtValue();
   int64_t Adj = N->getSExtValue() < 0 ? -2048 : 2047;
-  return CurDAG->getSignedConstant(Imm - Adj, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(Imm - Adj, SDLoc(N),
+                                         N->getValueType(0));
 }]>;
 
 // Return -2048 if immediate is negative or 2047 if positive. These are the
 // largest simm12 values.
 def AddiPairImmLarge : SDNodeXForm<imm, [{
   int64_t Imm = N->getSExtValue() < 0 ? -2048 : 2047;
-  return CurDAG->getSignedConstant(Imm, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(Imm, SDLoc(N), N->getValueType(0));
 }]>;
 
 def TrailingZeros : SDNodeXForm<imm, [{
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 0de43c458f22c..b01af468d9ea2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -36,7 +36,7 @@ def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmRV32Zdinx">;
 def GPRPairAsFPR : AsmOperandClass {
   let Name = "GPRPairAsFPR";
   let ParserMethod = "parseGPRPairAsFPR64";
-  let PredicateMethod = "isGPRPairAsFPR";
+  let PredicateMethod = "isGPRPairAsFPR64";
   let RenderMethod = "addRegOperands";
 }
 
@@ -457,16 +457,16 @@ def : PatSetCC<FPR64INX, any_fsetccs, SETOLE, FLE_D_INX, f64>;
 
 let Predicates = [HasStdExtZdinx, IsRV32] in {
 // Match signaling FEQ_D
-def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETEQ)),
+def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs2, SETEQ)),
           (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)),
                (XLenVT (FLE_D_IN32X $rs2, $rs1)))>;
-def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs2, SETOEQ)),
+def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs2, SETOEQ)),
           (AND (XLenVT (FLE_D_IN32X $rs1, $rs2)),
                (XLenVT (FLE_D_IN32X $rs2, $rs1)))>;
 // If both operands are the same, use a single FLE.
-def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETEQ)),
+def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs1, SETEQ)),
           (FLE_D_IN32X $rs1, $rs1)>;
-def : Pat<(XLenVT (strict_fsetccs FPR64IN32X:$rs1, FPR64IN32X:$rs1, SETOEQ)),
+def : Pat<(XLenVT (strict_fsetccs (f64 FPR64IN32X:$rs1), FPR64IN32X:$rs1, SETOEQ)),
           (FLE_D_IN32X $rs1, $rs1)>;
 
 def : PatSetCC<FPR64IN32X, any_fsetccs, SETLT,  FLT_D_IN32X, f64>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 2654a54c95d1b..6c4e41711440e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -104,8 +104,8 @@ def vec_rm : RISCVOp {
 def VLOp : ComplexPattern<XLenVT, 1, "selectVLOp">;
 
 def DecImm : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(N->getSExtValue() - 1, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(N->getSExtValue() - 1, SDLoc(N),
+                                         N->getValueType(0));
 }]>;
 
 defvar TAIL_AGNOSTIC = 1;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 90747a6b745cf..99186ec7360e7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -769,7 +769,9 @@ defm : LdIdxPat<zextloadi16, TH_LRHU>;
 
 defm : StIdxPat<truncstorei8, TH_SRB, GPR>;
 defm : StIdxPat<truncstorei16, TH_SRH, GPR>;
+}
 
+let Predicates = [HasVendorXTHeadMemIdx, IsRV32] in {
 defm : LdIdxPat<load, TH_LRW, i32>;
 defm : StIdxPat<store, TH_SRW, GPR, i32>;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index ccb851f9322d6..c57e7af3c5614 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -145,8 +145,18 @@ def BCLRIANDIMask : PatLeaf<(imm), [{
 }]>;
 
 def BCLRIANDIMaskLow : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant((N->getZExtValue() & 0x7ff) | ~0x7ffull,
-                                   SDLoc(N), N->getValueType(0), /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant((N->getZExtValue() & 0x7ff) | ~0x7ffull,
+                                         SDLoc(N), N->getValueType(0));
+}]>;
+
+def SimmShiftRightBy2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getSignedTargetConstant(N->getSExtValue() >> 2, SDLoc(N),
+                                         N->getValueType(0));
+}]>;
+
+def SimmShiftRightBy3XForm : SDNodeXForm<imm, [{
+  return CurDAG->getSignedTargetConstant(N->getSExtValue() >> 3, SDLoc(N),
+                                         N->getValueType(0));
 }]>;
 
 def CSImm12MulBy4 : PatLeaf<(imm), [{
@@ -155,7 +165,7 @@ def CSImm12MulBy4 : PatLeaf<(imm), [{
   int64_t C = N->getSExtValue();
   // Skip if C is simm12, an lui, or can be optimized by the PatLeaf AddiPair.
   return !isInt<13>(C) && !isShiftedInt<20, 12>(C) && isShiftedInt<12, 2>(C);
-}]>;
+}], SimmShiftRightBy2XForm>;
 
 def CSImm12MulBy8 : PatLeaf<(imm), [{
   if (!N->hasOneUse())
@@ -164,17 +174,7 @@ def CSImm12MulBy8 : PatLeaf<(imm), [{
   // Skip if C is simm12, an lui or can be optimized by the PatLeaf AddiPair or
   // CSImm12MulBy4.
   return !isInt<14>(C) && !isShiftedInt<20, 12>(C) && isShiftedInt<12, 3>(C);
-}]>;
-
-def SimmShiftRightBy2XForm : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(N->getSExtValue() >> 2, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
-}]>;
-
-def SimmShiftRightBy3XForm : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(N->getSExtValue() >> 3, SDLoc(N),
-                                   N->getValueType(0), /*isTarget=*/true);
-}]>;
+}], SimmShiftRightBy3XForm>;
 
 // Pattern to exclude simm12 immediates from matching, namely `non_imm12`.
 // GISel currently doesn't support PatFrag for leaf nodes, so `non_imm12`
@@ -654,10 +654,10 @@ foreach i = {1,2,3} in {
 }
 
 def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy4:$i),
-          (SH2ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))),
+          (SH2ADD (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)),
                   GPR:$r)>;
 def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy8:$i),
-          (SH3ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))),
+          (SH3ADD (XLenVT (ADDI (XLenVT X0), CSImm12MulBy8:$i)),
                   GPR:$r)>;
 
 } // Predicates = [HasStdExtZba]
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index e52a856183678..e96281bb46950 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -407,6 +407,28 @@ def SYNTACORE_SCR7 : RISCVProcessorModel<"syntacore-scr7",
                                                FeatureStdExtZkn],
                                               [TuneNoDefaultUnroll, FeaturePostRAScheduler]>;
 
+def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
+                                                 NoSchedModel,
+                                                 !listconcat(RVA23S64Features,
+                                                 [FeatureStdExtSmaia,
+                                                  FeatureStdExtSsaia,
+                                                  FeatureStdExtSscofpmf,
+                                                  FeatureStdExtSsstrict,
+                                                  FeatureStdExtZfbfmin,
+                                                  FeatureStdExtZfh,
+                                                  FeatureStdExtZicsr,
+                                                  FeatureStdExtZvbc,
+                                                  FeatureStdExtZvfbfmin,
+                                                  FeatureStdExtZvfbfwma,
+                                                  FeatureStdExtZvfh,
+                                                  FeatureStdExtZvkng,
+                                                  FeatureStdExtZvl256b,
+                                                  FeatureUnalignedScalarMem,
+                                                  FeatureUnalignedVectorMem]),
+                                                 [TuneNoDefaultUnroll,
+                                                  TuneOptimizedZeroStrideLoad,
+                                                  FeaturePostRAScheduler]>;
+
 def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
                                             NoSchedModel,
                                             [Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 803c3ec195106..7eb93973459c0 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -208,6 +208,8 @@ let RegAltNameIndices = [ABIRegAltName] in {
 
 def XLenVT : ValueTypeByHwMode<[RV32, RV64],
                                [i32,  i64]>;
+defvar XLenPairVT = untyped;
+
 // Allow f64 in GPR for ZDINX on RV64.
 def XLenFVT : ValueTypeByHwMode<[RV64],
                                 [f64]>;
@@ -323,7 +325,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
 
 let RegInfos = XLenPairRI,
     DecoderMethod = "DecodeGPRPairRegisterClass" in {
-def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add
+def GPRPair : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add
     X10_X11, X12_X13, X14_X15, X16_X17,
     X6_X7,
     X28_X29, X30_X31,
@@ -332,11 +334,11 @@ def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add
     X0_Pair, X2_X3, X4_X5
 )>;
 
-def GPRPairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRPair, X0_Pair)>;
+def GPRPairNoX0 : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (sub GPRPair, X0_Pair)>;
 } // let RegInfos = XLenPairRI, DecoderMethod = "DecodeGPRPairRegisterClass"
 
 let RegInfos = XLenPairRI in
-def GPRPairC : RISCVRegisterClass<[XLenPairFVT], 64, (add
+def GPRPairC : RISCVRegisterClass<[XLenPairVT, XLenPairFVT], 64, (add
   X10_X11, X12_X13, X14_X15, X8_X9
 )>;
 
@@ -462,7 +464,6 @@ def GPRF32C : RISCVRegisterClass<[f32], 32, (add (sequence "X%u_W", 10, 15),
                                                  (sequence "X%u_W", 8, 9))>;
 def GPRF32NoX0 : RISCVRegisterClass<[f32], 32, (sub GPRF32, X0_W)>;
 
-
 //===----------------------------------------------------------------------===//
 // Vector type mapping to LLVM types.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 4edaeef354a59..2b16dcbcd8695 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1104,26 +1104,6 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       return Cost * LT.first;
     break;
   }
-  // vp integer arithmetic ops.
-  case Intrinsic::vp_add:
-  case Intrinsic::vp_and:
-  case Intrinsic::vp_ashr:
-  case Intrinsic::vp_lshr:
-  case Intrinsic::vp_mul:
-  case Intrinsic::vp_or:
-  case Intrinsic::vp_sdiv:
-  case Intrinsic::vp_shl:
-  case Intrinsic::vp_srem:
-  case Intrinsic::vp_sub:
-  case Intrinsic::vp_udiv:
-  case Intrinsic::vp_urem:
-  case Intrinsic::vp_xor:
-  // vp float arithmetic ops.
-  case Intrinsic::vp_fadd:
-  case Intrinsic::vp_fsub:
-  case Intrinsic::vp_fmul:
-  case Intrinsic::vp_fdiv:
-  case Intrinsic::vp_frem:
   case Intrinsic::vp_fneg: {
     std::optional<unsigned> FOp =
         VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
@@ -1164,23 +1144,6 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0], ICA.getReturnType(),
                               UI->getPredicate(), CostKind);
   }
-  // vp load/store
-  case Intrinsic::vp_load:
-  case Intrinsic::vp_store: {
-    if (!ICA.getInst())
-      break;
-    Intrinsic::ID IID = ICA.getID();
-    std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
-    assert(FOp.has_value());
-    auto *UI = cast<VPIntrinsic>(ICA.getInst());
-    if (ICA.getID() == Intrinsic::vp_load)
-      return getMemoryOpCost(
-          *FOp, ICA.getReturnType(), UI->getPointerAlignment(),
-          UI->getOperand(0)->getType()->getPointerAddressSpace(), CostKind);
-    return getMemoryOpCost(
-        *FOp, ICA.getArgTypes()[0], UI->getPointerAlignment(),
-        UI->getOperand(1)->getType()->getPointerAddressSpace(), CostKind);
-  }
   case Intrinsic::vp_select: {
     Intrinsic::ID IID = ICA.getID();
     std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 6646795408fea..8f38d4b8307da 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -102,6 +102,7 @@ class SPIRVPassConfig : public TargetPassConfig {
   SPIRVTargetMachine &getSPIRVTargetMachine() const {
     return getTM<SPIRVTargetMachine>();
   }
+  void addMachineSSAOptimization() override;
   void addIRPasses() override;
   void addISelPrepare() override;
 
@@ -129,6 +130,16 @@ FunctionPass *SPIRVPassConfig::createTargetRegisterAllocator(bool) {
   return nullptr;
 }
 
+// Disable passes that may break CFG.
+void SPIRVPassConfig::addMachineSSAOptimization() {
+  // Some standard passes that optimize machine instructions in SSA form use
+  // MI.isPHI(), which doesn't account for OpPhi in SPIR-V, and so are able to
+  // break the CFG (e.g., MachineSink).
+  disablePass(&MachineSinkingID);
+
+  TargetPassConfig::addMachineSSAOptimization();
+}
+
 // Disable passes that break from assuming no virtual registers exist.
 void SPIRVPassConfig::addPostRegAlloc() {
   // Do not work with vregs instead of physical regs.
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index eb6a852980079..b459fbcad909f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -44,8 +44,8 @@ def ULO7 : SDNodeXForm<imm, [{
                                    SDLoc(N), MVT::i32);
 }]>;
 def LO7 : SDNodeXForm<imm, [{
-  return CurDAG->getSignedConstant(SignExtend64(N->getSExtValue(), 7),
-                                   SDLoc(N), MVT::i32, /*isTarget=*/true);
+  return CurDAG->getSignedTargetConstant(SignExtend64(N->getSExtValue(), 7),
+                                         SDLoc(N), MVT::i32);
 }]>;
 def MIMM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(val2MImm(getImmVal(N)),
diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
index d425a0d507524..ee456a11d5844 100644
--- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
@@ -1254,16 +1254,16 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I,
 
   if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
     if (HasVLX)
-      I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rri));
+      I.setDesc(TII.get(X86::VEXTRACTF32X4Z256rri));
     else if (HasAVX)
       I.setDesc(TII.get(X86::VEXTRACTF128rri));
     else
       return false;
   } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
     if (DstTy.getSizeInBits() == 128)
-      I.setDesc(TII.get(X86::VEXTRACTF32x4Zrri));
+      I.setDesc(TII.get(X86::VEXTRACTF32X4Zrri));
     else if (DstTy.getSizeInBits() == 256)
-      I.setDesc(TII.get(X86::VEXTRACTF64x4Zrri));
+      I.setDesc(TII.get(X86::VEXTRACTF64X4Zrri));
     else
       return false;
   } else
@@ -1387,16 +1387,16 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
 
   if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
     if (HasVLX)
-      I.setDesc(TII.get(X86::VINSERTF32x4Z256rri));
+      I.setDesc(TII.get(X86::VINSERTF32X4Z256rri));
     else if (HasAVX)
       I.setDesc(TII.get(X86::VINSERTF128rri));
     else
       return false;
   } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
     if (InsertRegTy.getSizeInBits() == 128)
-      I.setDesc(TII.get(X86::VINSERTF32x4Zrri));
+      I.setDesc(TII.get(X86::VINSERTF32X4Zrri));
     else if (InsertRegTy.getSizeInBits() == 256)
-      I.setDesc(TII.get(X86::VINSERTF64x4Zrri));
+      I.setDesc(TII.get(X86::VINSERTF64X4Zrri));
     else
       return false;
   } else
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 0f76808f55bc7..b67c573e217ba 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -318,18 +318,18 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
   case X86::VPCMPWZ128rmik:  case X86::VPCMPWZ128rrik:
   case X86::VPCMPWZ256rmik:  case X86::VPCMPWZ256rrik:
   case X86::VPCMPWZrmik:     case X86::VPCMPWZrrik:
-  case X86::VPCMPDZ128rmib:  case X86::VPCMPDZ128rmibk:
-  case X86::VPCMPDZ256rmib:  case X86::VPCMPDZ256rmibk:
-  case X86::VPCMPDZrmib:     case X86::VPCMPDZrmibk:
-  case X86::VPCMPQZ128rmib:  case X86::VPCMPQZ128rmibk:
-  case X86::VPCMPQZ256rmib:  case X86::VPCMPQZ256rmibk:
-  case X86::VPCMPQZrmib:     case X86::VPCMPQZrmibk:
-  case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
-  case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
-  case X86::VPCMPUDZrmib:    case X86::VPCMPUDZrmibk:
-  case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
-  case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
-  case X86::VPCMPUQZrmib:    case X86::VPCMPUQZrmibk:
+  case X86::VPCMPDZ128rmbi:  case X86::VPCMPDZ128rmbik:
+  case X86::VPCMPDZ256rmbi:  case X86::VPCMPDZ256rmbik:
+  case X86::VPCMPDZrmbi:     case X86::VPCMPDZrmbik:
+  case X86::VPCMPQZ128rmbi:  case X86::VPCMPQZ128rmbik:
+  case X86::VPCMPQZ256rmbi:  case X86::VPCMPQZ256rmbik:
+  case X86::VPCMPQZrmbi:     case X86::VPCMPQZrmbik:
+  case X86::VPCMPUDZ128rmbi: case X86::VPCMPUDZ128rmbik:
+  case X86::VPCMPUDZ256rmbi: case X86::VPCMPUDZ256rmbik:
+  case X86::VPCMPUDZrmbi:    case X86::VPCMPUDZrmbik:
+  case X86::VPCMPUQZ128rmbi: case X86::VPCMPUQZ128rmbik:
+  case X86::VPCMPUQZ256rmbi: case X86::VPCMPUQZ256rmbik:
+  case X86::VPCMPUQZrmbi:    case X86::VPCMPUQZrmbik:
     if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
       OS << '\t';
       printVPCMPMnemonic(MI, OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 90222278d1ad6..a57b1335d1437 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -207,7 +207,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
   case MCSymbolRefExpr::VK_GOTPCREL:
     checkIs32(Ctx, Loc, Type);
     // Older versions of ld.bfd/ld.gold/lld
-    // do not support GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX,
+    // do not support GOTPCRELX/REX_GOTPCRELX/CODE_4_GOTPCRELX,
     // and we want to keep back-compatibility.
     if (!Ctx.getTargetOptions()->X86RelaxRelocations)
       return ELF::R_X86_64_GOTPCREL;
@@ -221,7 +221,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
       return ELF::R_X86_64_REX_GOTPCRELX;
     case X86::reloc_riprel_4byte_relax_rex2:
     case X86::reloc_riprel_4byte_movq_load_rex2:
-      return ELF::R_X86_64_REX2_GOTPCRELX;
+      return ELF::R_X86_64_CODE_4_GOTPCRELX;
     }
     llvm_unreachable("unexpected relocation type!");
   case MCSymbolRefExpr::VK_GOTPCREL_NORELAX:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
index ba503756cf41a..1e9d44068b3f3 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86EncodingOptimization.cpp
@@ -223,38 +223,38 @@ bool X86::optimizeVPCMPWithImmediateOneOrSix(MCInst &MI) {
     FROM_TO(VPCMPBZrri, VPCMPEQBZrr, VPCMPGTBZrr)
     FROM_TO(VPCMPBZrrik, VPCMPEQBZrrk, VPCMPGTBZrrk)
     FROM_TO(VPCMPDZ128rmi, VPCMPEQDZ128rm, VPCMPGTDZ128rm)
-    FROM_TO(VPCMPDZ128rmib, VPCMPEQDZ128rmb, VPCMPGTDZ128rmb)
-    FROM_TO(VPCMPDZ128rmibk, VPCMPEQDZ128rmbk, VPCMPGTDZ128rmbk)
+    FROM_TO(VPCMPDZ128rmbi, VPCMPEQDZ128rmb, VPCMPGTDZ128rmb)
+    FROM_TO(VPCMPDZ128rmbik, VPCMPEQDZ128rmbk, VPCMPGTDZ128rmbk)
     FROM_TO(VPCMPDZ128rmik, VPCMPEQDZ128rmk, VPCMPGTDZ128rmk)
     FROM_TO(VPCMPDZ128rri, VPCMPEQDZ128rr, VPCMPGTDZ128rr)
     FROM_TO(VPCMPDZ128rrik, VPCMPEQDZ128rrk, VPCMPGTDZ128rrk)
     FROM_TO(VPCMPDZ256rmi, VPCMPEQDZ256rm, VPCMPGTDZ256rm)
-    FROM_TO(VPCMPDZ256rmib, VPCMPEQDZ256rmb, VPCMPGTDZ256rmb)
-    FROM_TO(VPCMPDZ256rmibk, VPCMPEQDZ256rmbk, VPCMPGTDZ256rmbk)
+    FROM_TO(VPCMPDZ256rmbi, VPCMPEQDZ256rmb, VPCMPGTDZ256rmb)
+    FROM_TO(VPCMPDZ256rmbik, VPCMPEQDZ256rmbk, VPCMPGTDZ256rmbk)
     FROM_TO(VPCMPDZ256rmik, VPCMPEQDZ256rmk, VPCMPGTDZ256rmk)
     FROM_TO(VPCMPDZ256rri, VPCMPEQDZ256rr, VPCMPGTDZ256rr)
     FROM_TO(VPCMPDZ256rrik, VPCMPEQDZ256rrk, VPCMPGTDZ256rrk)
     FROM_TO(VPCMPDZrmi, VPCMPEQDZrm, VPCMPGTDZrm)
-    FROM_TO(VPCMPDZrmib, VPCMPEQDZrmb, VPCMPGTDZrmb)
-    FROM_TO(VPCMPDZrmibk, VPCMPEQDZrmbk, VPCMPGTDZrmbk)
+    FROM_TO(VPCMPDZrmbi, VPCMPEQDZrmb, VPCMPGTDZrmb)
+    FROM_TO(VPCMPDZrmbik, VPCMPEQDZrmbk, VPCMPGTDZrmbk)
     FROM_TO(VPCMPDZrmik, VPCMPEQDZrmk, VPCMPGTDZrmk)
     FROM_TO(VPCMPDZrri, VPCMPEQDZrr, VPCMPGTDZrr)
     FROM_TO(VPCMPDZrrik, VPCMPEQDZrrk, VPCMPGTDZrrk)
     FROM_TO(VPCMPQZ128rmi, VPCMPEQQZ128rm, VPCMPGTQZ128rm)
-    FROM_TO(VPCMPQZ128rmib, VPCMPEQQZ128rmb, VPCMPGTQZ128rmb)
-    FROM_TO(VPCMPQZ128rmibk, VPCMPEQQZ128rmbk, VPCMPGTQZ128rmbk)
+    FROM_TO(VPCMPQZ128rmbi, VPCMPEQQZ128rmb, VPCMPGTQZ128rmb)
+    FROM_TO(VPCMPQZ128rmbik, VPCMPEQQZ128rmbk, VPCMPGTQZ128rmbk)
     FROM_TO(VPCMPQZ128rmik, VPCMPEQQZ128rmk, VPCMPGTQZ128rmk)
     FROM_TO(VPCMPQZ128rri, VPCMPEQQZ128rr, VPCMPGTQZ128rr)
     FROM_TO(VPCMPQZ128rrik, VPCMPEQQZ128rrk, VPCMPGTQZ128rrk)
     FROM_TO(VPCMPQZ256rmi, VPCMPEQQZ256rm, VPCMPGTQZ256rm)
-    FROM_TO(VPCMPQZ256rmib, VPCMPEQQZ256rmb, VPCMPGTQZ256rmb)
-    FROM_TO(VPCMPQZ256rmibk, VPCMPEQQZ256rmbk, VPCMPGTQZ256rmbk)
+    FROM_TO(VPCMPQZ256rmbi, VPCMPEQQZ256rmb, VPCMPGTQZ256rmb)
+    FROM_TO(VPCMPQZ256rmbik, VPCMPEQQZ256rmbk, VPCMPGTQZ256rmbk)
     FROM_TO(VPCMPQZ256rmik, VPCMPEQQZ256rmk, VPCMPGTQZ256rmk)
     FROM_TO(VPCMPQZ256rri, VPCMPEQQZ256rr, VPCMPGTQZ256rr)
     FROM_TO(VPCMPQZ256rrik, VPCMPEQQZ256rrk, VPCMPGTQZ256rrk)
     FROM_TO(VPCMPQZrmi, VPCMPEQQZrm, VPCMPGTQZrm)
-    FROM_TO(VPCMPQZrmib, VPCMPEQQZrmb, VPCMPGTQZrmb)
-    FROM_TO(VPCMPQZrmibk, VPCMPEQQZrmbk, VPCMPGTQZrmbk)
+    FROM_TO(VPCMPQZrmbi, VPCMPEQQZrmb, VPCMPGTQZrmb)
+    FROM_TO(VPCMPQZrmbik, VPCMPEQQZrmbk, VPCMPGTQZrmbk)
     FROM_TO(VPCMPQZrmik, VPCMPEQQZrmk, VPCMPGTQZrmk)
     FROM_TO(VPCMPQZrri, VPCMPEQQZrr, VPCMPGTQZrr)
     FROM_TO(VPCMPQZrrik, VPCMPEQQZrrk, VPCMPGTQZrrk)
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 49e8bab4c0363..70c71273d270f 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -41,18 +41,18 @@ using namespace llvm;
   CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
 
 #define CASE_FPCLASS_PACKED(Inst, src)    \
-  CASE_AVX_INS_COMMON(Inst, Z, r##src)    \
-  CASE_AVX_INS_COMMON(Inst, Z256, r##src) \
-  CASE_AVX_INS_COMMON(Inst, Z128, r##src) \
-  CASE_MASK_INS_COMMON(Inst, Z, r##src)
+  CASE_AVX_INS_COMMON(Inst, Z, src##i)    \
+  CASE_AVX_INS_COMMON(Inst, Z256, src##i) \
+  CASE_AVX_INS_COMMON(Inst, Z128, src##i) \
+  CASE_MASK_INS_COMMON(Inst, Z, src##i)
 
 #define CASE_FPCLASS_PACKED_MEM(Inst) \
   CASE_FPCLASS_PACKED(Inst, m)        \
   CASE_FPCLASS_PACKED(Inst, mb)
 
 #define CASE_FPCLASS_SCALAR(Inst, src)  \
-  CASE_AVX_INS_COMMON(Inst, Z, r##src)  \
-  CASE_MASK_INS_COMMON(Inst, Z, r##src)
+  CASE_AVX_INS_COMMON(Inst, Z, src##i)  \
+  CASE_MASK_INS_COMMON(Inst, Z, src##i)
 
 #define CASE_PTERNLOG(Inst, src)                                               \
   CASE_AVX512_INS_COMMON(Inst, Z, r##src##i)                                   \
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 51b82321d679b..fafcc737ff983 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -168,9 +168,9 @@ void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
   case X86::VPCMPDZ128rmik: case X86::VPCMPDZ128rrik:
   case X86::VPCMPDZ256rmik: case X86::VPCMPDZ256rrik:
   case X86::VPCMPDZrmik:    case X86::VPCMPDZrrik:
-  case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
-  case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
-  case X86::VPCMPDZrmib:    case X86::VPCMPDZrmibk:
+  case X86::VPCMPDZ128rmbi: case X86::VPCMPDZ128rmbik:
+  case X86::VPCMPDZ256rmbi: case X86::VPCMPDZ256rmbik:
+  case X86::VPCMPDZrmbi:    case X86::VPCMPDZrmbik:
     OS << "d\t";
     break;
   case X86::VPCMPQZ128rmi:  case X86::VPCMPQZ128rri:
@@ -179,9 +179,9 @@ void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
   case X86::VPCMPQZ128rmik: case X86::VPCMPQZ128rrik:
   case X86::VPCMPQZ256rmik: case X86::VPCMPQZ256rrik:
   case X86::VPCMPQZrmik:    case X86::VPCMPQZrrik:
-  case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
-  case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
-  case X86::VPCMPQZrmib:    case X86::VPCMPQZrmibk:
+  case X86::VPCMPQZ128rmbi: case X86::VPCMPQZ128rmbik:
+  case X86::VPCMPQZ256rmbi: case X86::VPCMPQZ256rmbik:
+  case X86::VPCMPQZrmbi:    case X86::VPCMPQZrmbik:
     OS << "q\t";
     break;
   case X86::VPCMPUBZ128rmi:  case X86::VPCMPUBZ128rri:
@@ -198,9 +198,9 @@ void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
   case X86::VPCMPUDZ128rmik: case X86::VPCMPUDZ128rrik:
   case X86::VPCMPUDZ256rmik: case X86::VPCMPUDZ256rrik:
   case X86::VPCMPUDZrmik:    case X86::VPCMPUDZrrik:
-  case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
-  case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
-  case X86::VPCMPUDZrmib:    case X86::VPCMPUDZrmibk:
+  case X86::VPCMPUDZ128rmbi: case X86::VPCMPUDZ128rmbik:
+  case X86::VPCMPUDZ256rmbi: case X86::VPCMPUDZ256rmbik:
+  case X86::VPCMPUDZrmbi:    case X86::VPCMPUDZrmbik:
     OS << "ud\t";
     break;
   case X86::VPCMPUQZ128rmi:  case X86::VPCMPUQZ128rri:
@@ -209,9 +209,9 @@ void X86InstPrinterCommon::printVPCMPMnemonic(const MCInst *MI,
   case X86::VPCMPUQZ128rmik: case X86::VPCMPUQZ128rrik:
   case X86::VPCMPUQZ256rmik: case X86::VPCMPUQZ256rrik:
   case X86::VPCMPUQZrmik:    case X86::VPCMPUQZrrik:
-  case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
-  case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
-  case X86::VPCMPUQZrmib:    case X86::VPCMPUQZrmibk:
+  case X86::VPCMPUQZ128rmbi: case X86::VPCMPUQZ128rmbik:
+  case X86::VPCMPUQZ256rmbi: case X86::VPCMPUQZ256rmbik:
+  case X86::VPCMPUQZrmbi:    case X86::VPCMPUQZrmbik:
     OS << "uq\t";
     break;
   case X86::VPCMPUWZ128rmi:  case X86::VPCMPUWZ128rri:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index c2b284ad924d0..680092679c903 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -295,18 +295,18 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
   case X86::VPCMPWZ128rmik:  case X86::VPCMPWZ128rrik:
   case X86::VPCMPWZ256rmik:  case X86::VPCMPWZ256rrik:
   case X86::VPCMPWZrmik:     case X86::VPCMPWZrrik:
-  case X86::VPCMPDZ128rmib:  case X86::VPCMPDZ128rmibk:
-  case X86::VPCMPDZ256rmib:  case X86::VPCMPDZ256rmibk:
-  case X86::VPCMPDZrmib:     case X86::VPCMPDZrmibk:
-  case X86::VPCMPQZ128rmib:  case X86::VPCMPQZ128rmibk:
-  case X86::VPCMPQZ256rmib:  case X86::VPCMPQZ256rmibk:
-  case X86::VPCMPQZrmib:     case X86::VPCMPQZrmibk:
-  case X86::VPCMPUDZ128rmib: case X86::VPCMPUDZ128rmibk:
-  case X86::VPCMPUDZ256rmib: case X86::VPCMPUDZ256rmibk:
-  case X86::VPCMPUDZrmib:    case X86::VPCMPUDZrmibk:
-  case X86::VPCMPUQZ128rmib: case X86::VPCMPUQZ128rmibk:
-  case X86::VPCMPUQZ256rmib: case X86::VPCMPUQZ256rmibk:
-  case X86::VPCMPUQZrmib:    case X86::VPCMPUQZrmibk:
+  case X86::VPCMPDZ128rmbi:  case X86::VPCMPDZ128rmbik:
+  case X86::VPCMPDZ256rmbi:  case X86::VPCMPDZ256rmbik:
+  case X86::VPCMPDZrmbi:     case X86::VPCMPDZrmbik:
+  case X86::VPCMPQZ128rmbi:  case X86::VPCMPQZ128rmbik:
+  case X86::VPCMPQZ256rmbi:  case X86::VPCMPQZ256rmbik:
+  case X86::VPCMPQZrmbi:     case X86::VPCMPQZrmbik:
+  case X86::VPCMPUDZ128rmbi: case X86::VPCMPUDZ128rmbik:
+  case X86::VPCMPUDZ256rmbi: case X86::VPCMPUDZ256rmbik:
+  case X86::VPCMPUDZrmbi:    case X86::VPCMPUDZrmbik:
+  case X86::VPCMPUQZ128rmbi: case X86::VPCMPUQZ128rmbik:
+  case X86::VPCMPUQZ256rmbi: case X86::VPCMPUQZ256rmbik:
+  case X86::VPCMPUQZrmbi:    case X86::VPCMPUQZrmbik:
     if ((Imm >= 0 && Imm <= 2) || (Imm >= 4 && Imm <= 6)) {
       OS << '\t';
       printVPCMPMnemonic(MI, OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 206436191c258..f9916fa82b1ff 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -641,7 +641,7 @@ void X86MCCodeEmitter::emitMemModRMByte(
       case X86::MOV64rm:
         // movq loads is a subset of reloc_riprel_4byte_relax_rex/rex2. It is a
         // special case because COFF and Mach-O don't support ELF's more
-        // flexible R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX relaxation.
+        // flexible R_X86_64_REX_GOTPCRELX/R_X86_64_CODE_4_GOTPCRELX relaxation.
         return Kind == REX2 ? X86::reloc_riprel_4byte_movq_load_rex2
                             : X86::reloc_riprel_4byte_movq_load;
       case X86::ADC32rm:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 72de0e0e8761f..0641dca07a890 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -313,8 +313,7 @@ namespace {
         Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
       else
-        Disp =
-            CurDAG->getSignedConstant(AM.Disp, DL, MVT::i32, /*isTarget=*/true);
+        Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
 
       if (AM.Segment.getNode())
         Segment = AM.Segment;
@@ -3775,8 +3774,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
       }
 
       if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
-        Operand = CurDAG->getSignedConstant(OperandV, SDLoc(Node), MemVT,
-                                            /*isTarget=*/true);
+        Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
         NewOpc = SelectImmOpcode(Opc);
       }
     }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c984c5b6da873..fea66e9582cfb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9926,6 +9926,43 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
 }
 
+// Canonicalize SHUFPD mask to improve chances of further folding.
+// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
+static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
+  assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
+         "Unexpected SHUFPD mask size");
+  assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
+         "Unexpected SHUFPD mask elements");
+
+  // If the mask only uses one non-undef element, then fully 'splat' it to
+  // improve later broadcast matching.
+  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+  assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
+         "All undef shuffle mask");
+
+  int FirstElt = Mask[FirstIndex];
+  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
+      count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
+    unsigned Imm = 0;
+    for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+      Imm |= FirstElt << I;
+    return Imm;
+  }
+
+  // Attempt to keep any undef elements in place to improve chances of the
+  // shuffle becoming a (commutative) blend.
+  unsigned Imm = 0;
+  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+    Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
+
+  return Imm;
+}
+
+static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
+}
+
 // The Shuffle result is as follow:
 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
 // Each Zeroable's element correspond to a particular Mask's element.
@@ -14871,7 +14908,7 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
 
   int LHSMask[4] = {-1, -1, -1, -1};
   int RHSMask[4] = {-1, -1, -1, -1};
-  unsigned SHUFPMask = 0;
+  int SHUFPDMask[4] = {-1, -1, -1, -1};
 
   // As SHUFPD uses a single LHS/RHS element per lane, we can always
   // perform the shuffle once the lanes have been shuffled in place.
@@ -14882,13 +14919,13 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
     int LaneBase = i & ~1;
     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
     LaneMask[LaneBase + (M & 1)] = M;
-    SHUFPMask |= (M & 1) << i;
+    SHUFPDMask[i] = M & 1;
   }
 
   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
-                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+                     getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
 }
 
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -15800,9 +15837,9 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
 
   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
-  ShuffleImm = 0;
-  bool ShufpdMask = true;
-  bool CommutableMask = true;
+  bool IsSHUFPD = true;
+  bool IsCommutable = true;
+  SmallVector<int, 8> SHUFPDMask(NumElts, -1);
   for (int i = 0; i < NumElts; ++i) {
     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
       continue;
@@ -15811,20 +15848,21 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
     int Val = (i & 6) + NumElts * (i & 1);
     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
     if (Mask[i] < Val || Mask[i] > Val + 1)
-      ShufpdMask = false;
+      IsSHUFPD = false;
     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
-      CommutableMask = false;
-    ShuffleImm |= (Mask[i] % 2) << i;
+      IsCommutable = false;
+    SHUFPDMask[i] = Mask[i] % 2;
   }
 
-  if (!ShufpdMask && !CommutableMask)
+  if (!IsSHUFPD && !IsCommutable)
     return false;
 
-  if (!ShufpdMask && CommutableMask)
+  if (!IsSHUFPD && IsCommutable)
     std::swap(V1, V2);
 
   ForceV1Zero = ZeroLane[0];
   ForceV2Zero = ZeroLane[1];
+  ShuffleImm = getSHUFPDImm(SHUFPDMask);
   return true;
 }
 
@@ -25270,13 +25308,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
     const Align StackAlign = TFI.getStackAlign();
     if (hasInlineStackProbe(MF)) {
-      MachineRegisterInfo &MRI = MF.getRegInfo();
-
-      const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
-      Register Vreg = MRI.createVirtualRegister(AddrRegClass);
-      Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
-      Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
-                           DAG.getRegister(Vreg, SPTy));
+      Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
+                           {Chain, Size});
+      Chain = Result.getValue(1);
     } else {
       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
       Chain = SP.getValue(1);
@@ -25288,8 +25322,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
           DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
   } else if (SplitStack) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-
     if (Is64Bit) {
       // The 64 bit implementation of segmented stacks needs to clobber both r10
       // r11. This makes it impossible to use it along with nested parameters.
@@ -25301,11 +25333,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
       }
     }
 
-    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
-    Register Vreg = MRI.createVirtualRegister(AddrRegClass);
-    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
-    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
-                                DAG.getRegister(Vreg, SPTy));
+    Result =
+        DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
+    Chain = Result.getValue(1);
   } else {
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
     Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 3bf61f22c9f1f..8a764de561413 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2428,8 +2428,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Ops.push_back(Callee);
 
   if (isTailCall)
-    Ops.push_back(
-        DAG.getSignedConstant(FPDiff, dl, MVT::i32, /*isTarget=*/true));
+    Ops.push_back(DAG.getSignedTargetConstant(FPDiff, dl, MVT::i32));
 
   // Add argument registers to the end of the list so that they are known live
   // into the call.
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 9480838e8a7bd..a05a3063cac55 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -427,24 +427,24 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
                             X86FoldableSchedWrite sched> {
 
   let Predicates = [HasVLX] in
-    defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+    defm NAME # "32X4Z256" : vinsert_for_size<Opcode128,
                                  X86VectorVTInfo< 4, EltVT32, VR128X>,
                                  X86VectorVTInfo< 8, EltVT32, VR256X>,
                                  vinsert128_insert, sched>, EVEX_V256;
 
-  defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
+  defm NAME # "32X4Z" : vinsert_for_size<Opcode128,
                                  X86VectorVTInfo< 4, EltVT32, VR128X>,
                                  X86VectorVTInfo<16, EltVT32, VR512>,
                                  vinsert128_insert, sched>, EVEX_V512;
 
-  defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+  defm NAME # "64X4Z" : vinsert_for_size<Opcode256,
                                  X86VectorVTInfo< 4, EltVT64, VR256X>,
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
                                  vinsert256_insert, sched>, REX_W, EVEX_V512;
 
   // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasVLX, HasDQI] in
-    defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
+    defm NAME # "64X2Z256" : vinsert_for_size_split<Opcode128,
                                    X86VectorVTInfo< 2, EltVT64, VR128X>,
                                    X86VectorVTInfo< 4, EltVT64, VR256X>,
                                    null_frag, vinsert128_insert, sched>,
@@ -452,13 +452,13 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
 
   // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasDQI] in {
-    defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
+    defm NAME # "64X2Z" : vinsert_for_size_split<Opcode128,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
                                  null_frag, vinsert128_insert, sched>,
                                  REX_W, EVEX_V512;
 
-    defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
+    defm NAME # "32X8Z" : vinsert_for_size_split<Opcode256,
                                    X86VectorVTInfo< 8, EltVT32, VR256X>,
                                    X86VectorVTInfo<16, EltVT32, VR512>,
                                    null_frag, vinsert256_insert, sched>,
@@ -472,47 +472,47 @@ defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, WriteShuffle256>;
 
 // Codegen pattern with the alternative types,
 // Even with AVX512DQ we'll still use these for unmasked operations.
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z256", v2f64x_info, v4f64x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z256", v2i64x_info, v4i64x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
 
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z", v2f64x_info, v8f64_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z", v2i64x_info, v8i64_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 
-defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+defm : vinsert_for_size_lowering<"VINSERTF64X4Z", v8f32x_info, v16f32_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+defm : vinsert_for_size_lowering<"VINSERTI64X4Z", v8i32x_info, v16i32_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 
 // Codegen pattern with the alternative types insert VEC128 into VEC256
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z256", v8i16x_info, v16i16x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z256", v16i8x_info, v32i8x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z256", v8f16x_info, v16f16x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8bf16x_info, v16bf16x_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z256", v8bf16x_info, v16bf16x_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
 // Codegen pattern with the alternative types insert VEC128 into VEC512
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z", v8i16x_info, v32i16_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+defm : vinsert_for_size_lowering<"VINSERTI32X4Z", v16i8x_info, v64i8_info,
                vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z", v8f16x_info, v32f16_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8bf16x_info, v32bf16_info,
+defm : vinsert_for_size_lowering<"VINSERTF32X4Z", v8bf16x_info, v32bf16_info,
               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 // Codegen pattern with the alternative types insert VEC256 into VEC512
-defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+defm : vinsert_for_size_lowering<"VINSERTI64X4Z", v16i16x_info, v32i16_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+defm : vinsert_for_size_lowering<"VINSERTI64X4Z", v32i8x_info, v64i8_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info,
+defm : vinsert_for_size_lowering<"VINSERTF64X4Z", v16f16x_info, v32f16_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16bf16x_info, v32bf16_info,
+defm : vinsert_for_size_lowering<"VINSERTF64X4Z", v16bf16x_info, v32bf16_info,
               vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 
 
@@ -568,81 +568,81 @@ let Predicates = p in {
 }
 }
 
-defm : vinsert_for_mask_cast<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+defm : vinsert_for_mask_cast<"VINSERTF32X4Z256", v2f64x_info, v4f64x_info,
                              v8f32x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4f32x_info, v8f32x_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X2Z256", v4f32x_info, v8f32x_info,
                              v4f64x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
 
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z256", v2i64x_info, v4i64x_info,
                              v8i32x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z256", v8i16x_info, v16i16x_info,
                              v8i32x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z256", v16i8x_info, v32i8x_info,
                              v8i32x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4i32x_info, v8i32x_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X2Z256", v4i32x_info, v8i32x_info,
                              v4i64x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v8i16x_info, v16i16x_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X2Z256", v8i16x_info, v16i16x_info,
                              v4i64x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v16i8x_info, v32i8x_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X2Z256", v16i8x_info, v32i8x_info,
                              v4i64x_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
 
-defm : vinsert_for_mask_cast<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+defm : vinsert_for_mask_cast<"VINSERTF32X4Z", v2f64x_info, v8f64_info,
                              v16f32_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x2Z", v4f32x_info, v16f32_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X2Z", v4f32x_info, v16f32_info,
                              v8f64_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI]>;
 
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z", v2i64x_info, v8i64_info,
                              v16i32_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z", v8i16x_info, v32i16_info,
                              v16i32_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X4Z", v16i8x_info, v64i8_info,
                              v16i32_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v4i32x_info, v16i32_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X2Z", v4i32x_info, v16i32_info,
                              v8i64_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v8i16x_info, v32i16_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X2Z", v8i16x_info, v32i16_info,
                              v8i64_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v16i8x_info, v64i8_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X2Z", v16i8x_info, v64i8_info,
                              v8i64_info, vinsert128_insert,
                              INSERT_get_vinsert128_imm, [HasDQI]>;
 
-defm : vinsert_for_mask_cast<"VINSERTF32x8Z", v4f64x_info, v8f64_info,
+defm : vinsert_for_mask_cast<"VINSERTF32X8Z", v4f64x_info, v8f64_info,
                              v16f32_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+defm : vinsert_for_mask_cast<"VINSERTF64X4Z", v8f32x_info, v16f32_info,
                              v8f64_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasAVX512]>;
 
-defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v4i64x_info, v8i64_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X8Z", v4i64x_info, v8i64_info,
                              v16i32_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v16i16x_info, v32i16_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X8Z", v16i16x_info, v32i16_info,
                              v16i32_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v32i8x_info, v64i8_info,
+defm : vinsert_for_mask_cast<"VINSERTI32X8Z", v32i8x_info, v64i8_info,
                              v16i32_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasDQI]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X4Z", v8i32x_info, v16i32_info,
                              v8i64_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X4Z", v16i16x_info, v32i16_info,
                              v8i64_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasAVX512]>;
-defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+defm : vinsert_for_mask_cast<"VINSERTI64X4Z", v32i8x_info, v64i8_info,
                              v8i64_info, vinsert256_insert,
                              INSERT_get_vinsert256_imm, [HasAVX512]>;
 
@@ -732,19 +732,19 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
                              ValueType EltVT64, int Opcode256,
                              SchedWrite SchedRR, SchedWrite SchedMR> {
   let Predicates = [HasAVX512] in {
-    defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+    defm NAME # "32X4Z" : vextract_for_size<Opcode128,
                                    X86VectorVTInfo<16, EltVT32, VR512>,
                                    X86VectorVTInfo< 4, EltVT32, VR128X>,
                                    vextract128_extract, SchedRR, SchedMR>,
                                        EVEX_V512, EVEX_CD8<32, CD8VT4>;
-    defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+    defm NAME # "64X4Z" : vextract_for_size<Opcode256,
                                    X86VectorVTInfo< 8, EltVT64, VR512>,
                                    X86VectorVTInfo< 4, EltVT64, VR256X>,
                                    vextract256_extract, SchedRR, SchedMR>,
                                        REX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
   }
   let Predicates = [HasVLX] in
-    defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+    defm NAME # "32X4Z256" : vextract_for_size<Opcode128,
                                  X86VectorVTInfo< 8, EltVT32, VR256X>,
                                  X86VectorVTInfo< 4, EltVT32, VR128X>,
                                  vextract128_extract, SchedRR, SchedMR>,
@@ -752,7 +752,7 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
 
   // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasVLX, HasDQI] in
-    defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
+    defm NAME # "64X2Z256" : vextract_for_size_split<Opcode128,
                                  X86VectorVTInfo< 4, EltVT64, VR256X>,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
                                  null_frag, vextract128_extract, SchedRR, SchedMR>,
@@ -760,12 +760,12 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
 
   // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasDQI] in {
-    defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
+    defm NAME # "64X2Z" : vextract_for_size_split<Opcode128,
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
                                  null_frag, vextract128_extract, SchedRR, SchedMR>,
                                      REX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
-    defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
+    defm NAME # "32X8Z" : vextract_for_size_split<Opcode256,
                                  X86VectorVTInfo<16, EltVT32, VR512>,
                                  X86VectorVTInfo< 8, EltVT32, VR256X>,
                                  null_frag, vextract256_extract, SchedRR, SchedMR>,
@@ -779,48 +779,48 @@ defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, WriteShuffle256, WriteV
 
 // extract_subvector codegen patterns with the alternative types.
 // Even with AVX512DQ we'll still use these for unmasked operations.
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z", v8f64_info, v2f64x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z", v8i64_info, v2i64x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
 
-defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v16f32_info, v8f32x_info,
           vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI64X4Z", v16i32_info, v8i32x_info,
           vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z256", v4f64x_info, v2f64x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z256", v4i64x_info, v2i64x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
 
 // Codegen pattern with the alternative types extract VEC128 from VEC256
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z256", v16i16x_info, v8i16x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z256", v32i8x_info, v16i8x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z256", v16f16x_info, v8f16x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16bf16x_info, v8bf16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z256", v16bf16x_info, v8bf16x_info,
           vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
 
 // Codegen pattern with the alternative types extract VEC128 from VEC512
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z", v32i16_info, v8i16x_info,
                  vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI32X4Z", v64i8_info, v16i8x_info,
                  vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z", v32f16_info, v8f16x_info,
                  vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32bf16_info, v8bf16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF32X4Z", v32bf16_info, v8bf16x_info,
                  vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
 // Codegen pattern with the alternative types extract VEC256 from VEC512
-defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI64X4Z", v32i16_info, v16i16x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+defm : vextract_for_size_lowering<"VEXTRACTI64X4Z", v64i8_info, v32i8x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32f16_info, v16f16x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32bf16_info, v16bf16x_info,
+defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info,
                  vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
 
@@ -861,31 +861,31 @@ def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
 // smaller extract to enable EVEX->VEX.
 let Predicates = [HasVLX] in {
 def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
-          (v2i64 (VEXTRACTI32x4Z256rri
+          (v2i64 (VEXTRACTI32X4Z256rri
                   (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))),
-          (v2f64 (VEXTRACTF32x4Z256rri
+          (v2f64 (VEXTRACTF32X4Z256rri
                   (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))),
-          (v4i32 (VEXTRACTI32x4Z256rri
+          (v4i32 (VEXTRACTI32X4Z256rri
                   (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))),
-          (v4f32 (VEXTRACTF32x4Z256rri
+          (v4f32 (VEXTRACTF32X4Z256rri
                   (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
-          (v8i16 (VEXTRACTI32x4Z256rri
+          (v8i16 (VEXTRACTI32X4Z256rri
                   (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
-          (v8f16 (VEXTRACTF32x4Z256rri
+          (v8f16 (VEXTRACTF32X4Z256rri
                   (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
-          (v16i8 (VEXTRACTI32x4Z256rri
+          (v16i8 (VEXTRACTI32X4Z256rri
                   (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
                   (iPTR 1)))>;
 }
@@ -919,81 +919,81 @@ let Predicates = p in {
 }
 }
 
-defm : vextract_for_mask_cast<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF32X4Z256", v4f64x_info, v2f64x_info,
                               v4f32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTF64x2Z256", v8f32x_info, v4f32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF64X2Z256", v8f32x_info, v4f32x_info,
                               v2f64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
 
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z256", v4i64x_info, v2i64x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z256", v16i16x_info, v8i16x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z256", v32i8x_info, v16i8x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v8i32x_info, v4i32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z256", v8i32x_info, v4i32x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v16i16x_info, v8i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z256", v16i16x_info, v8i16x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v32i8x_info, v16i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z256", v32i8x_info, v16i8x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
 
-defm : vextract_for_mask_cast<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF32X4Z", v8f64_info, v2f64x_info,
                               v4f32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTF64x2Z", v16f32_info, v4f32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF64X2Z", v16f32_info, v4f32x_info,
                               v2f64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI]>;
 
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z", v8i64_info, v2i64x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z", v32i16_info, v8i16x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X4Z", v64i8_info, v16i8x_info,
                               v4i32x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v16i32_info, v4i32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z", v16i32_info, v4i32x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v32i16_info, v8i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z", v32i16_info, v8i16x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v64i8_info, v16i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X2Z", v64i8_info, v16i8x_info,
                               v2i64x_info, vextract128_extract,
                               EXTRACT_get_vextract128_imm, [HasDQI]>;
 
-defm : vextract_for_mask_cast<"VEXTRACTF32x8Z", v8f64_info, v4f64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF32X8Z", v8f64_info, v4f64x_info,
                               v8f32x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTF64X4Z", v16f32_info, v8f32x_info,
                               v4f64x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
-defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v8i64_info, v4i64x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X8Z", v8i64_info, v4i64x_info,
                               v8i32x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v32i16_info, v16i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X8Z", v32i16_info, v16i16x_info,
                               v8i32x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v64i8_info, v32i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI32X8Z", v64i8_info, v32i8x_info,
                               v8i32x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasDQI]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X4Z", v16i32_info, v8i32x_info,
                               v4i64x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X4Z", v32i16_info, v16i16x_info,
                               v4i64x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasAVX512]>;
-defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+defm : vextract_for_mask_cast<"VEXTRACTI64X4Z", v64i8_info, v32i8x_info,
                               v4i64x_info, vextract256_extract,
                               EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
@@ -2190,7 +2190,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
                               PatFrag Frag_su, X86FoldableSchedWrite sched,
                               X86VectorVTInfo _, string Name> :
            avx512_icmp_cc<opc, Suffix, Frag, Frag_su, sched, _, Name> {
-  def rmib : AVX512AIi8<opc, MRMSrcMem,
+  def rmbi : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
                                      u8imm:$cc),
              !strconcat("vpcmp", Suffix,
@@ -2201,7 +2201,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
                                        (_.BroadcastLdFrag addr:$src2),
                                        cond)))]>,
              EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
-  def rmibk : AVX512AIi8<opc, MRMSrcMem,
+  def rmbik : AVX512AIi8<opc, MRMSrcMem,
               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
                                        _.ScalarMemOp:$src2, u8imm:$cc),
               !strconcat("vpcmp", Suffix,
@@ -2216,13 +2216,13 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
 
   def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2),
                     (_.VT _.RC:$src1), cond)),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbi")
              _.RC:$src1, addr:$src2, (X86pcmpm_imm_commute $cc))>;
 
   def : Pat<(and _.KRCWM:$mask,
                  (_.KVT (Frag_su:$cc (_.BroadcastLdFrag addr:$src2),
                                      (_.VT _.RC:$src1), cond))),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+            (!cast<Instruction>(Name#_.ZSuffix#"rmbik")
              _.KRCWM:$mask, _.RC:$src1, addr:$src2,
              (X86pcmpm_imm_commute $cc))>;
 }
@@ -2457,13 +2457,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
                                  X86FoldableSchedWrite sched, X86VectorVTInfo _,
                                  Predicate prd> {
   let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
-      def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+      def ri : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
                       (ins _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
                               (i32 timm:$src2)))]>,
                       Sched<[sched]>;
-      def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+      def rik : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr#_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
@@ -2471,7 +2471,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
                                       (X86Vfpclasss_su (_.VT _.RC:$src1),
                                       (i32 timm:$src2))))]>,
                       EVEX_K, Sched<[sched]>;
-    def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+    def mi : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2479,7 +2479,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
                           (X86Vfpclasss (_.ScalarIntMemFrags addr:$src1),
                                         (i32 timm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
-    def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+    def mik : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
@@ -2497,13 +2497,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
                                  X86FoldableSchedWrite sched, X86VectorVTInfo _,
                                  string mem, list<Register> _Uses = [MXCSR]>{
   let ExeDomain = _.ExeDomain, Uses = _Uses in {
-  def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+  def ri : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
                       (ins _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
                                        (i32 timm:$src2)))]>,
                       Sched<[sched]>;
-  def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+  def rik : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr#_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
@@ -2511,7 +2511,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
                                        (X86Vfpclass_su (_.VT _.RC:$src1),
                                        (i32 timm:$src2))))]>,
                       EVEX_K, Sched<[sched]>;
-  def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+  def mi : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.MemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#"{"#mem#"}"#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2519,7 +2519,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
                                      (_.VT (_.LdFrag addr:$src1)),
                                      (i32 timm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
-  def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+  def mik : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#"{"#mem#"}"#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
@@ -2527,7 +2527,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
                                   (_.VT (_.LdFrag addr:$src1)),
                                   (i32 timm:$src2))))]>,
                     EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
-  def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+  def mbi : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
                                       _.BroadcastStr#", $dst|$dst, ${src1}"
@@ -2536,7 +2536,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
                                      (_.VT (_.BroadcastLdFrag addr:$src1)),
                                      (i32 timm:$src2)))]>,
                     EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
-  def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+  def mbik : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
                     (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
                     OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
                           _.BroadcastStr#", $dst {${mask}}|$dst {${mask}}, ${src1}"#
@@ -2551,21 +2551,21 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
   // the memory form.
   def : InstAlias<OpcodeStr#_.Suffix#mem#
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                  (!cast<Instruction>(NAME#"rr")
+                  (!cast<Instruction>(NAME#"ri")
                    _.KRC:$dst, _.RC:$src1, i32u8imm:$src2), 0, "att">;
   def : InstAlias<OpcodeStr#_.Suffix#mem#
                   "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                  (!cast<Instruction>(NAME#"rrk")
+                  (!cast<Instruction>(NAME#"rik")
                    _.KRC:$dst, _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), 0, "att">;
   def : InstAlias<OpcodeStr#_.Suffix#mem#
                   "\t{$src2, ${src1}"#_.BroadcastStr#", $dst|$dst, ${src1}"#
                   _.BroadcastStr#", $src2}",
-                  (!cast<Instruction>(NAME#"rmb")
+                  (!cast<Instruction>(NAME#"mbi")
                    _.KRC:$dst, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
   def : InstAlias<OpcodeStr#_.Suffix#mem#
                   "\t{$src2, ${src1}"#_.BroadcastStr#", $dst {${mask}}|"
                   "$dst {${mask}}, ${src1}"#_.BroadcastStr#", $src2}",
-                  (!cast<Instruction>(NAME#"rmbk")
+                  (!cast<Instruction>(NAME#"mbik")
                    _.KRC:$dst, _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), 0, "att">;
 }
 
@@ -2619,9 +2619,9 @@ multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                           X86MemOperand x86memop, string Suffix = ""> {
   let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove],
       explicitOpPrefix = !if(!eq(Suffix, ""), NoExplicitOpPrefix, ExplicitEVEX) in
-  def kk#Suffix : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
-                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
-                  Sched<[WriteMove]>;
+    def kk#Suffix : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+                    Sched<[WriteMove]>;
   def km#Suffix : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set KRC:$dst, (vvt (load addr:$src)))]>,
@@ -2992,7 +2992,7 @@ multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_
 def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
                                 (Narrow.BroadcastLdFrag addr:$src2), cond)),
           (COPY_TO_REGCLASS
-           (!cast<Instruction>(InstStr#"Zrmib")
+           (!cast<Instruction>(InstStr#"Zrmbi")
             (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
             addr:$src2, (X86pcmpm_imm $cc)), Narrow.KRC)>;
 
@@ -3001,7 +3001,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
                             (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
                                          (Narrow.BroadcastLdFrag addr:$src2),
                                          cond)))),
-          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
            (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
            addr:$src2, (X86pcmpm_imm $cc)), Narrow.KRC)>;
@@ -3011,7 +3011,7 @@ def : Pat<(Narrow.KVT (Frag:$cc (Narrow.BroadcastLdFrag addr:$src2),
                                 (Narrow.VT Narrow.RC:$src1),
                                 cond)),
           (COPY_TO_REGCLASS
-           (!cast<Instruction>(InstStr#"Zrmib")
+           (!cast<Instruction>(InstStr#"Zrmbi")
             (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
             addr:$src2, (X86pcmpm_imm_commute $cc)), Narrow.KRC)>;
 
@@ -3020,7 +3020,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
                             (Frag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
                                          (Narrow.VT Narrow.RC:$src1),
                                          cond)))),
-          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
            (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
            addr:$src2, (X86pcmpm_imm_commute $cc)), Narrow.KRC)>;
@@ -4549,20 +4549,20 @@ let Predicates = [HasAVX512] in {
 
 def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
                       (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
-                      [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>,
+                      [], SSEPackedInt>, Sched<[SchedWriteVecMoveLSNT.ZMM.RM]>,
                       EVEX, T8, PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 let Predicates = [HasVLX] in {
   def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
                        (ins i256mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
-                       [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>,
+                       [], SSEPackedInt>, Sched<[SchedWriteVecMoveLSNT.YMM.RM]>,
                        EVEX, T8, PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
 
   def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
                       (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
-                      [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>,
+                      [], SSEPackedInt>, Sched<[SchedWriteVecMoveLSNT.XMM.RM]>,
                       EVEX, T8, PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 3af3aa838159d..e8a50227912d8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -451,13 +451,10 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
     return -(I->getOperand(1).getImm());
   }
 
-  // Handle other opcodes we reasonably expect to see in call
-  // sequences. Note this may include spill/restore of FP/BP.
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences.
   switch (MI.getOpcode()) {
   default:
-    assert(!(MI.modifiesRegister(X86::RSP, &RI) ||
-             MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) &&
-           "Unhandled opcode in getSPAdjust");
     return 0;
   case X86::PUSH32r:
   case X86::PUSH32rmm:
@@ -469,30 +466,6 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
   case X86::PUSH64rmr:
   case X86::PUSH64i32:
     return 8;
-  case X86::POP32r:
-  case X86::POP32rmm:
-  case X86::POP32rmr:
-    return -4;
-  case X86::POP64r:
-  case X86::POP64rmm:
-  case X86::POP64rmr:
-    return -8;
-  // FIXME: (implement and) use isAddImmediate in the
-  // default case instead of the following ADD/SUB cases.
-  case X86::ADD32ri:
-  case X86::ADD32ri8:
-  case X86::ADD64ri32:
-    if (MI.getOperand(0).getReg() == X86::RSP &&
-        MI.getOperand(1).getReg() == X86::RSP)
-      return -MI.getOperand(2).getImm();
-    return 0;
-  case X86::SUB32ri:
-  case X86::SUB32ri8:
-  case X86::SUB64ri32:
-    if (MI.getOperand(0).getReg() == X86::RSP &&
-        MI.getOperand(1).getReg() == X86::RSP)
-      return MI.getOperand(2).getImm();
-    return 0;
   }
 }
 
@@ -6293,16 +6266,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
                            get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
   case X86::VMOVAPSZ128mr_NOVLX:
     return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
-                            get(X86::VEXTRACTF32x4Zmri), X86::sub_xmm);
+                            get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
   case X86::VMOVUPSZ128mr_NOVLX:
     return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
-                            get(X86::VEXTRACTF32x4Zmri), X86::sub_xmm);
+                            get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
   case X86::VMOVAPSZ256mr_NOVLX:
     return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
-                            get(X86::VEXTRACTF64x4Zmri), X86::sub_ymm);
+                            get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
   case X86::VMOVUPSZ256mr_NOVLX:
     return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
-                            get(X86::VEXTRACTF64x4Zmri), X86::sub_ymm);
+                            get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
   case X86::MOV32ri64: {
     Register Reg = MIB.getReg(0);
     Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
@@ -7797,8 +7770,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFIXUPIMMSSZrri:
     case X86::VFIXUPIMMSSZrrik:
     case X86::VFIXUPIMMSSZrrikz:
-    case X86::VFPCLASSSSZrr:
-    case X86::VFPCLASSSSZrrk:
+    case X86::VFPCLASSSSZri:
+    case X86::VFPCLASSSSZrik:
     case X86::VGETEXPSSZr:
     case X86::VGETEXPSSZrk:
     case X86::VGETEXPSSZrkz:
@@ -7966,8 +7939,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFIXUPIMMSDZrri:
     case X86::VFIXUPIMMSDZrrik:
     case X86::VFIXUPIMMSDZrrikz:
-    case X86::VFPCLASSSDZrr:
-    case X86::VFPCLASSSDZrrk:
+    case X86::VFPCLASSSDZri:
+    case X86::VFPCLASSSDZrik:
     case X86::VGETEXPSDZr:
     case X86::VGETEXPSDZrk:
     case X86::VGETEXPSDZrkz:
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
index aec8f3ee7484f..7b57f7c23bf4d 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -15,9 +15,7 @@ using namespace llvm;
 
 yaml::X86MachineFunctionInfo::X86MachineFunctionInfo(
     const llvm::X86MachineFunctionInfo &MFI)
-    : AMXProgModel(MFI.getAMXProgModel()),
-      FPClobberedByCall(MFI.getFPClobberedByCall()),
-      HasPushSequences(MFI.getHasPushSequences()) {}
+    : AMXProgModel(MFI.getAMXProgModel()) {}
 
 void yaml::X86MachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
   MappingTraits<X86MachineFunctionInfo>::mapping(YamlIO, *this);
@@ -33,8 +31,6 @@ MachineFunctionInfo *X86MachineFunctionInfo::clone(
 void X86MachineFunctionInfo::initializeBaseYamlFields(
     const yaml::X86MachineFunctionInfo &YamlMFI) {
   AMXProgModel = YamlMFI.AMXProgModel;
-  FPClobberedByCall = YamlMFI.FPClobberedByCall;
-  HasPushSequences = YamlMFI.HasPushSequences;
 }
 
 void X86MachineFunctionInfo::anchor() { }
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 6414e6f22500c..24371369d4a45 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -38,8 +38,6 @@ template <> struct ScalarEnumerationTraits<AMXProgModelEnum> {
 
 struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo {
   AMXProgModelEnum AMXProgModel;
-  bool FPClobberedByCall;
-  bool HasPushSequences;
 
   X86MachineFunctionInfo() = default;
   X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI);
@@ -51,8 +49,6 @@ struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo {
 template <> struct MappingTraits<X86MachineFunctionInfo> {
   static void mapping(IO &YamlIO, X86MachineFunctionInfo &MFI) {
     YamlIO.mapOptional("amxProgModel", MFI.AMXProgModel);
-    YamlIO.mapOptional("FPClobberedByCall", MFI.FPClobberedByCall, false);
-    YamlIO.mapOptional("hasPushSequences", MFI.HasPushSequences, false);
   }
 };
 } // end namespace yaml
diff --git a/llvm/lib/Target/X86/X86ReplaceableInstrs.def b/llvm/lib/Target/X86/X86ReplaceableInstrs.def
index 9deb7a8bdacb8..fe7295548fe45 100644
--- a/llvm/lib/Target/X86/X86ReplaceableInstrs.def
+++ b/llvm/lib/Target/X86/X86ReplaceableInstrs.def
@@ -110,30 +110,30 @@ ENTRY(VBROADCASTSDZ256rr, VBROADCASTSDZ256rr, VPBROADCASTQZ256rr)
 ENTRY(VBROADCASTSDZ256rm, VBROADCASTSDZ256rm, VPBROADCASTQZ256rm)
 ENTRY(VBROADCASTSDZrr, VBROADCASTSDZrr, VPBROADCASTQZrr)
 ENTRY(VBROADCASTSDZrm, VBROADCASTSDZrm, VPBROADCASTQZrm)
-ENTRY(VINSERTF32x4Zrri, VINSERTF32x4Zrri, VINSERTI32x4Zrri)
-ENTRY(VINSERTF32x4Zrmi, VINSERTF32x4Zrmi, VINSERTI32x4Zrmi)
-ENTRY(VINSERTF32x8Zrri, VINSERTF32x8Zrri, VINSERTI32x8Zrri)
-ENTRY(VINSERTF32x8Zrmi, VINSERTF32x8Zrmi, VINSERTI32x8Zrmi)
-ENTRY(VINSERTF64x2Zrri, VINSERTF64x2Zrri, VINSERTI64x2Zrri)
-ENTRY(VINSERTF64x2Zrmi, VINSERTF64x2Zrmi, VINSERTI64x2Zrmi)
-ENTRY(VINSERTF64x4Zrri, VINSERTF64x4Zrri, VINSERTI64x4Zrri)
-ENTRY(VINSERTF64x4Zrmi, VINSERTF64x4Zrmi, VINSERTI64x4Zrmi)
-ENTRY(VINSERTF32x4Z256rri, VINSERTF32x4Z256rri, VINSERTI32x4Z256rri)
-ENTRY(VINSERTF32x4Z256rmi, VINSERTF32x4Z256rmi, VINSERTI32x4Z256rmi)
-ENTRY(VINSERTF64x2Z256rri, VINSERTF64x2Z256rri, VINSERTI64x2Z256rri)
-ENTRY(VINSERTF64x2Z256rmi, VINSERTF64x2Z256rmi, VINSERTI64x2Z256rmi)
-ENTRY(VEXTRACTF32x4Zrri, VEXTRACTF32x4Zrri, VEXTRACTI32x4Zrri)
-ENTRY(VEXTRACTF32x4Zmri, VEXTRACTF32x4Zmri, VEXTRACTI32x4Zmri)
-ENTRY(VEXTRACTF32x8Zrri, VEXTRACTF32x8Zrri, VEXTRACTI32x8Zrri)
-ENTRY(VEXTRACTF32x8Zmri, VEXTRACTF32x8Zmri, VEXTRACTI32x8Zmri)
-ENTRY(VEXTRACTF64x2Zrri, VEXTRACTF64x2Zrri, VEXTRACTI64x2Zrri)
-ENTRY(VEXTRACTF64x2Zmri, VEXTRACTF64x2Zmri, VEXTRACTI64x2Zmri)
-ENTRY(VEXTRACTF64x4Zrri, VEXTRACTF64x4Zrri, VEXTRACTI64x4Zrri)
-ENTRY(VEXTRACTF64x4Zmri, VEXTRACTF64x4Zmri, VEXTRACTI64x4Zmri)
-ENTRY(VEXTRACTF32x4Z256rri, VEXTRACTF32x4Z256rri, VEXTRACTI32x4Z256rri)
-ENTRY(VEXTRACTF32x4Z256mri, VEXTRACTF32x4Z256mri, VEXTRACTI32x4Z256mri)
-ENTRY(VEXTRACTF64x2Z256rri, VEXTRACTF64x2Z256rri, VEXTRACTI64x2Z256rri)
-ENTRY(VEXTRACTF64x2Z256mri, VEXTRACTF64x2Z256mri, VEXTRACTI64x2Z256mri)
+ENTRY(VINSERTF32X4Zrri, VINSERTF32X4Zrri, VINSERTI32X4Zrri)
+ENTRY(VINSERTF32X4Zrmi, VINSERTF32X4Zrmi, VINSERTI32X4Zrmi)
+ENTRY(VINSERTF32X8Zrri, VINSERTF32X8Zrri, VINSERTI32X8Zrri)
+ENTRY(VINSERTF32X8Zrmi, VINSERTF32X8Zrmi, VINSERTI32X8Zrmi)
+ENTRY(VINSERTF64X2Zrri, VINSERTF64X2Zrri, VINSERTI64X2Zrri)
+ENTRY(VINSERTF64X2Zrmi, VINSERTF64X2Zrmi, VINSERTI64X2Zrmi)
+ENTRY(VINSERTF64X4Zrri, VINSERTF64X4Zrri, VINSERTI64X4Zrri)
+ENTRY(VINSERTF64X4Zrmi, VINSERTF64X4Zrmi, VINSERTI64X4Zrmi)
+ENTRY(VINSERTF32X4Z256rri, VINSERTF32X4Z256rri, VINSERTI32X4Z256rri)
+ENTRY(VINSERTF32X4Z256rmi, VINSERTF32X4Z256rmi, VINSERTI32X4Z256rmi)
+ENTRY(VINSERTF64X2Z256rri, VINSERTF64X2Z256rri, VINSERTI64X2Z256rri)
+ENTRY(VINSERTF64X2Z256rmi, VINSERTF64X2Z256rmi, VINSERTI64X2Z256rmi)
+ENTRY(VEXTRACTF32X4Zrri, VEXTRACTF32X4Zrri, VEXTRACTI32X4Zrri)
+ENTRY(VEXTRACTF32X4Zmri, VEXTRACTF32X4Zmri, VEXTRACTI32X4Zmri)
+ENTRY(VEXTRACTF32X8Zrri, VEXTRACTF32X8Zrri, VEXTRACTI32X8Zrri)
+ENTRY(VEXTRACTF32X8Zmri, VEXTRACTF32X8Zmri, VEXTRACTI32X8Zmri)
+ENTRY(VEXTRACTF64X2Zrri, VEXTRACTF64X2Zrri, VEXTRACTI64X2Zrri)
+ENTRY(VEXTRACTF64X2Zmri, VEXTRACTF64X2Zmri, VEXTRACTI64X2Zmri)
+ENTRY(VEXTRACTF64X4Zrri, VEXTRACTF64X4Zrri, VEXTRACTI64X4Zrri)
+ENTRY(VEXTRACTF64X4Zmri, VEXTRACTF64X4Zmri, VEXTRACTI64X4Zmri)
+ENTRY(VEXTRACTF32X4Z256rri, VEXTRACTF32X4Z256rri, VEXTRACTI32X4Z256rri)
+ENTRY(VEXTRACTF32X4Z256mri, VEXTRACTF32X4Z256mri, VEXTRACTI32X4Z256mri)
+ENTRY(VEXTRACTF64X2Z256rri, VEXTRACTF64X2Z256rri, VEXTRACTI64X2Z256rri)
+ENTRY(VEXTRACTF64X2Z256mri, VEXTRACTF64X2Z256mri, VEXTRACTI64X2Z256mri)
 ENTRY(VPERMILPSmi, VPERMILPSmi, VPSHUFDmi)
 ENTRY(VPERMILPSri, VPERMILPSri, VPSHUFDri)
 ENTRY(VPERMILPSZ128mi, VPERMILPSZ128mi, VPSHUFDZ128mi)
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index a5051d932d4e2..9d6368f19607b 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -350,8 +350,8 @@ defm : ICXWriteResPair<WriteFVarBlendZ,[ICXPort015], 2, [2], 2, 7>;
 defm : X86WriteRes<WriteVecLoad,         [ICXPort23], 5, [1], 1>;
 defm : X86WriteRes<WriteVecLoadX,        [ICXPort23], 6, [1], 1>;
 defm : X86WriteRes<WriteVecLoadY,        [ICXPort23], 7, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNT,       [ICXPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNTY,      [ICXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT,       [ICXPort23,ICXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecLoadNTY,      [ICXPort23,ICXPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoad,   [ICXPort23,ICXPort015], 7, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoadY,  [ICXPort23,ICXPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecStore,        [ICXPort78,ICXPort49], 1, [1,1], 2>;
@@ -392,7 +392,7 @@ defm : ICXWriteResPair<WriteShuffle,  [ICXPort5],  1, [1], 1, 5>; // Vector shuf
 defm : ICXWriteResPair<WriteShuffleX, [ICXPort15], 1, [1], 1, 6>;
 defm : ICXWriteResPair<WriteShuffleY, [ICXPort15], 1, [1], 1, 7>;
 defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5],  1, [1], 1, 7>;
-defm : ICXWriteResPair<WriteVarShuffle,  [ICXPort5],  1, [1], 1, 5>; // Vector variable shuffles.
+defm : ICXWriteResPair<WriteVarShuffle,  [ICXPort0,ICXPort5],  1, [1,1], 2, 5>; // Vector variable shuffles.
 defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort15], 1, [1], 1, 6>;
 defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort15], 1, [1], 1, 7>;
 defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5],  1, [1], 1, 7>;
@@ -861,8 +861,8 @@ def: InstRW<[ICXWriteResGroup33], (instregex "KADD(B|D|Q|W)kk",
                                              "VCMPPD(Z|Z128|Z256)rri",
                                              "VCMPPS(Z|Z128|Z256)rri",
                                              "VCMP(SD|SS)Zrr",
-                                             "VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
-                                             "VFPCLASS(SD|SS)Zrr",
+                                             "VFPCLASS(PD|PS)(Z|Z128|Z256)ri",
+                                             "VFPCLASS(SD|SS)Zri",
                                              "VPCMPB(Z|Z128|Z256)rri",
                                              "VPCMPD(Z|Z128|Z256)rri",
                                              "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
@@ -1361,8 +1361,7 @@ def ICXWriteResGroup95 : SchedWriteRes<[ICXPort23,ICXPort015]> {
   let NumMicroOps = 2;
   let ReleaseAtCycles = [1,1];
 }
-def: InstRW<[ICXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
-                                          VPBLENDDrmi)>;
+def: InstRW<[ICXWriteResGroup95], (instrs VPBLENDDrmi)>;
 def: InstRW<[ICXWriteResGroup95, ReadAfterVecXLd],
                                   (instregex "VBLENDMPDZ128rm(b?)",
                                              "VBLENDMPSZ128rm(b?)",
@@ -1568,8 +1567,7 @@ def ICXWriteResGroup121 : SchedWriteRes<[ICXPort23,ICXPort015]> {
   let NumMicroOps = 2;
   let ReleaseAtCycles = [1,1];
 }
-def: InstRW<[ICXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
-                                           VPBLENDDYrmi)>;
+def: InstRW<[ICXWriteResGroup121], (instrs VPBLENDDYrmi)>;
 def: InstRW<[ICXWriteResGroup121, ReadAfterVecYLd],
                                    (instregex "VBLENDMPD(Z|Z256)rm(b?)",
                                               "VBLENDMPS(Z|Z256)rm(b?)",
@@ -1591,14 +1589,14 @@ def: InstRW<[ICXWriteResGroup121, ReadAfterVecYLd],
                                               "VBROADCASTI64X4Zrm(b?)",
                                               "VBROADCASTSD(Z|Z256)rm(b?)",
                                               "VBROADCASTSS(Z|Z256)rm(b?)",
-                                              "VINSERTF32x4(Z|Z256)rm(b?)",
-                                              "VINSERTF32x8Zrm(b?)",
-                                              "VINSERTF64x2(Z|Z256)rm(b?)",
-                                              "VINSERTF64x4Zrm(b?)",
-                                              "VINSERTI32x4(Z|Z256)rm(b?)",
-                                              "VINSERTI32x8Zrm(b?)",
-                                              "VINSERTI64x2(Z|Z256)rm(b?)",
-                                              "VINSERTI64x4Zrm(b?)",
+                                              "VINSERTF32X4(Z|Z256)rm(b?)",
+                                              "VINSERTF32X8Zrm(b?)",
+                                              "VINSERTF64X2(Z|Z256)rm(b?)",
+                                              "VINSERTF64X4Zrm(b?)",
+                                              "VINSERTI32X4(Z|Z256)rm(b?)",
+                                              "VINSERTI32X8Zrm(b?)",
+                                              "VINSERTI64X2(Z|Z256)rm(b?)",
+                                              "VINSERTI64X4Zrm(b?)",
                                               "VMOVAPD(Z|Z256)rm(b?)",
                                               "VMOVAPS(Z|Z256)rm(b?)",
                                               "VMOVDDUP(Z|Z256)rm(b?)",
@@ -1705,8 +1703,8 @@ def: InstRW<[ICXWriteResGroup136], (instrs VPMOVSXBWYrm,
                                            VPMOVSXWDYrm,
                                            VPMOVZXWDYrm)>;
 def: InstRW<[ICXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
-                                              "VFPCLASSSDZrm(b?)",
-                                              "VFPCLASSSSZrm(b?)",
+                                              "VFPCLASSSDZm(b?)i",
+                                              "VFPCLASSSSZm(b?)i",
                                               "(V?)PCMPGTQrm",
                                               "VPERMI2DZ128rm(b?)",
                                               "VPERMI2PDZ128rm(b?)",
@@ -1728,15 +1726,15 @@ def ICXWriteResGroup136_2 : SchedWriteRes<[ICXPort5,ICXPort23]> {
 }
 def: InstRW<[ICXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
                                                 "VCMP(SD|SS)Zrm",
-                                                "VFPCLASSPDZ128rm(b?)",
-                                                "VFPCLASSPSZ128rm(b?)",
-                                                "VPCMPBZ128rmi(b?)",
-                                                "VPCMPDZ128rmi(b?)",
+                                                "VFPCLASSPDZ128m(b?)i",
+                                                "VFPCLASSPSZ128m(b?)i",
+                                                "VPCMPBZ128rm(b?)i",
+                                                "VPCMPDZ128rm(b?)i",
                                                 "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
                                                 "VPCMPGT(B|D|Q|W)Z128rm(b?)",
-                                                "VPCMPQZ128rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Z128rmi(b?)",
-                                                "VPCMPWZ128rmi(b?)",
+                                                "VPCMPQZ128rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Z128rm(b?)i",
+                                                "VPCMPWZ128rm(b?)i",
                                                 "(V?)PACK(U|S)S(DW|WB)(Z128)?rm",
                                                 "VPTESTMBZ128rm(b?)",
                                                 "VPTESTMDZ128rm(b?)",
@@ -1793,10 +1791,10 @@ def ICXWriteResGroup148_2 : SchedWriteRes<[ICXPort5,ICXPort23]> {
 }
 def: InstRW<[ICXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
                                                 "VCMPPS(Z|Z256)rm(b?)i",
-                                                "VFPCLASSPD(Z|Z256)rm(b?)",
-                                                "VFPCLASSPS(Z|Z256)rm(b?)",
-                                                "VPCMPB(Z|Z256)rmi(b?)",
-                                                "VPCMPD(Z|Z256)rmi(b?)",
+                                                "VFPCLASSPD(Z|Z256)m(b?)i",
+                                                "VFPCLASSPS(Z|Z256)m(b?)i",
+                                                "VPCMPB(Z|Z256)rm(b?)i",
+                                                "VPCMPD(Z|Z256)rm(b?)i",
                                                 "VPCMPEQB(Z|Z256)rm(b?)",
                                                 "VPCMPEQD(Z|Z256)rm(b?)",
                                                 "VPCMPEQQ(Z|Z256)rm(b?)",
@@ -1805,10 +1803,10 @@ def: InstRW<[ICXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
                                                 "VPCMPGTD(Z|Z256)rm(b?)",
                                                 "VPCMPGTQ(Z|Z256)rm(b?)",
                                                 "VPCMPGTW(Z|Z256)rm(b?)",
-                                                "VPCMPQ(Z|Z256)rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Z256rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Zrmi(b?)",
-                                                "VPCMPW(Z|Z256)rmi(b?)",
+                                                "VPCMPQ(Z|Z256)rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Z256rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Zrm(b?)i",
+                                                "VPCMPW(Z|Z256)rm(b?)i",
                                                 "(V?)PACK(U|S)S(DW|WB)(Y|Z|Z256)rm",
                                                 "VPTESTM(B|D|Q|W)Z256rm(b?)",
                                                 "VPTESTM(B|D|Q|W)Zrm(b?)",
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 0545f9b7f4c00..4344a48a52628 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -632,14 +632,14 @@ def : InstRW<[SPRWriteResGroup10], (instregex "^ADD_F(32|64)m$",
                                               "^VPOPCNT(D|Q)Z128rmbk(z?)$")>;
 def : InstRW<[SPRWriteResGroup10, ReadAfterVecXLd], (instregex "^(V?)PACK(S|U)S(DW|WB)rm$",
                                                                "^(V?)PCMPGTQrm$",
-                                                               "^VFPCLASSP(D|H|S)Z128rmb$",
+                                                               "^VFPCLASSP(D|H|S)Z128mbi$",
                                                                "^VPACK(S|U)S(DW|WB)Z128rm$",
                                                                "^VPACK(S|U)SDWZ128rmb$",
                                                                "^VPM(AX|IN)(S|U)QZ128rm((b|k|bk|kz)?)$",
                                                                "^VPM(AX|IN)(S|U)QZ128rmbkz$",
                                                                "^VPMULTISHIFTQBZ128rm(b?)$")>;
-def : InstRW<[SPRWriteResGroup10, ReadAfterVecXLd], (instrs VFPCLASSPHZ128rm)>;
-def : InstRW<[SPRWriteResGroup10, ReadAfterVecYLd], (instregex "^VFPCLASSP(D|H|S)Z((256)?)rm$",
+def : InstRW<[SPRWriteResGroup10, ReadAfterVecXLd], (instrs VFPCLASSPHZ128mi)>;
+def : InstRW<[SPRWriteResGroup10, ReadAfterVecYLd], (instregex "^VFPCLASSP(D|H|S)Z((256)?)mi$",
                                                                "^VPERM(I|T)2(D|Q|PS)Z128rm((b|k|bk|kz)?)$",
                                                                "^VPERM(I|T)2(D|Q|PS)Z128rmbkz$",
                                                                "^VPERM(I|T)2PDZ128rm((b|k|bk|kz)?)$",
@@ -670,8 +670,8 @@ def : InstRW<[SPRWriteResGroup12], (instregex "^ADD_F(P?)rST0$",
                                               "^VCMPP(D|H|S)Z(128|256)rri(k?)$",
                                               "^VCMPS(D|H|S)Zrri$",
                                               "^VCMPS(D|H|S)Zrr(b?)i_Int(k?)$",
-                                              "^VFPCLASSP(D|H|S)Z(128|256)rr(k?)$",
-                                              "^VFPCLASSS(D|H|S)Zrr(k?)$",
+                                              "^VFPCLASSP(D|H|S)Z(128|256)ri(k?)$",
+                                              "^VFPCLASSS(D|H|S)Zri(k?)$",
                                               "^VPACK(S|U)S(DW|WB)Yrr$",
                                               "^VPACK(S|U)S(DW|WB)Z(128|256)rr$",
                                               "^VPALIGNRZ(128|256)rrik(z?)$",
@@ -1666,8 +1666,8 @@ def : InstRW<[SPRWriteResGroup131], (instregex "^VBROADCAST(F|I)32X(8|2)Zrmk(z?)
                                                "^VMOVDQ(A|U)(32|64)Zrmk(z?)$",
                                                "^VPBROADCAST(D|Q)Zrmk(z?)$")>;
 def : InstRW<[SPRWriteResGroup131, ReadAfterVecLd], (instregex "^MMX_P(ADD|SUB)(B|D|Q|W)rm$")>;
-def : InstRW<[SPRWriteResGroup131, ReadAfterVecYLd], (instregex "^VINSERT(F|I)(32|64)x4Zrmi((k|kz)?)$",
-                                                                "^VINSERT(F|I)(32x8|64x2)Zrmi((k|kz)?)$",
+def : InstRW<[SPRWriteResGroup131, ReadAfterVecYLd], (instregex "^VINSERT(F|I)(32|64)X4Zrmi((k|kz)?)$",
+                                                                "^VINSERT(F|I)(32X8|64X2)Zrmi((k|kz)?)$",
                                                                 "^VP(ADD|SUB)(B|D|Q|W)Zrm$",
                                                                 "^VP(ADD|SUB)(D|Q)Zrm(b|k|kz)$",
                                                                 "^VP(ADD|SUB)(D|Q)Zrmbk(z?)$",
@@ -1697,8 +1697,8 @@ def : InstRW<[SPRWriteResGroup134], (instregex "^VPBROADCAST(BY|WZ)rm$",
                                                "^VPBROADCAST(B|W)Z256rm$",
                                                "^VPBROADCAST(BZ|WY)rm$")>;
 def : InstRW<[SPRWriteResGroup134, ReadAfterLd], (instrs MMX_PINSRWrmi)>;
-def : InstRW<[SPRWriteResGroup134, ReadAfterVecXLd], (instregex "^VFPCLASSP(D|S)Z128rm$")>;
-def : InstRW<[SPRWriteResGroup134, ReadAfterVecLd], (instregex "^VFPCLASSS(D|H|S)Zrm$")>;
+def : InstRW<[SPRWriteResGroup134, ReadAfterVecXLd], (instregex "^VFPCLASSP(D|S)Z128mi$")>;
+def : InstRW<[SPRWriteResGroup134, ReadAfterVecLd], (instregex "^VFPCLASSS(D|H|S)Zmi$")>;
 def : InstRW<[SPRWriteResGroup134, ReadAfterVecYLd], (instregex "^VPALIGNR(Y|Z256)rmi$")>;
 def : InstRW<[SPRWriteResGroup134, ReadAfterVecYLd], (instrs VPSHUFBZrm)>;
 
@@ -2659,7 +2659,7 @@ def : InstRW<[SPRWriteResGroup258], (instregex "^VPBROADCAST(B|W)Z128rmk(z?)$",
 def : InstRW<[SPRWriteResGroup258, ReadAfterVecYLd], (instregex "^VALIGN(D|Q)Z((256)?)rm(bi|ik)$",
                                                                 "^VALIGN(D|Q)Z((256)?)rmbik(z?)$",
                                                                 "^VALIGN(D|Q)Z((256)?)rmi((kz)?)$",
-                                                                "^VFPCLASSP(D|H|S)Z((256)?)rmb$",
+                                                                "^VFPCLASSP(D|H|S)Z((256)?)mbi$",
                                                                 "^VPACK(S|U)S(DW|WB)(Y|Z)rm$",
                                                                 "^VPACK(S|U)S(DW|WB)Z256rm$",
                                                                 "^VPACK(S|U)SDWZ((256)?)rmb$",
@@ -2710,7 +2710,7 @@ def : InstRW<[SPRWriteResGroup262], (instregex "^VBROADCAST(F|I)32X(2|4)Z256rmk(
                                                "^VMOVDQ(A|U)(32|64)Z256rmk(z?)$",
                                                "^VPBROADCAST(D|Q)Z256rmk(z?)$")>;
 def : InstRW<[SPRWriteResGroup262, ReadAfterVecYLd], (instregex "^VINSERT(F|I)128rmi$",
-                                                                "^VINSERT(F|I)(32x4|64x2)Z256rmi((k|kz)?)$",
+                                                                "^VINSERT(F|I)(32X4|64X2)Z256rmi((k|kz)?)$",
                                                                 "^VP(ADD|SUB)(B|D|Q|W)(Y|Z256)rm$",
                                                                 "^VP(ADD|SUB)(D|Q)Z256rm(b|k|kz)$",
                                                                 "^VP(ADD|SUB)(D|Q)Z256rmbk(z?)$",
@@ -2724,29 +2724,29 @@ def SPRWriteResGroup263 : SchedWriteRes<[SPRPort02_03_11, SPRPort05]> {
 }
 def : InstRW<[SPRWriteResGroup263, ReadAfterVecXLd], (instregex "^VCMPP(D|H|S)Z128rm(bi|ik)$",
                                                                 "^VCMPP(D|H|S)Z128rm(i|bik)$",
-                                                                "^VFPCLASSP(D|H|S)Z128rm(b?)k$",
+                                                                "^VFPCLASSP(D|H|S)Z128m(b?)ik$",
                                                                 "^VPCMP(B|D|Q|W|UD|UQ|UW)Z128rmi(k?)$",
-                                                                "^VPCMP(D|Q|UQ)Z128rmib(k?)$",
+                                                                "^VPCMP(D|Q|UQ)Z128rmbi(k?)$",
                                                                 "^VPCMP(EQ|GT)(B|D|Q|W)Z128rm(k?)$",
                                                                 "^VPCMP(EQ|GT)(D|Q)Z128rmb(k?)$",
                                                                 "^VPCMPUBZ128rmi(k?)$",
-                                                                "^VPCMPUDZ128rmib(k?)$",
+                                                                "^VPCMPUDZ128rmbi(k?)$",
                                                                 "^VPTEST(N?)M(B|D|Q|W)Z128rm(k?)$",
                                                                 "^VPTEST(N?)M(D|Q)Z128rmb(k?)$")>;
 def : InstRW<[SPRWriteResGroup263, ReadAfterVecYLd], (instregex "^VCMPP(D|H|S)Z((256)?)rm(bi|ik)$",
                                                                 "^VCMPP(D|H|S)Z((256)?)rm(i|bik)$",
-                                                                "^VFPCLASSP(D|H|S)Z((256)?)rm(b?)k$",
+                                                                "^VFPCLASSP(D|H|S)Z((256)?)m(b?)ik$",
                                                                 "^VPCMP(B|D|Q|W|UD|UQ|UW)Z((256)?)rmi(k?)$",
-                                                                "^VPCMP(D|Q|UQ)Z((256)?)rmib(k?)$",
+                                                                "^VPCMP(D|Q|UQ)Z((256)?)rmbi(k?)$",
                                                                 "^VPCMP(EQ|GT)(B|D|Q|W)Z((256)?)rm(k?)$",
                                                                 "^VPCMP(EQ|GT)(D|Q)Z((256)?)rmb(k?)$",
                                                                 "^VPCMPUBZ((256)?)rmi(k?)$",
-                                                                "^VPCMPUDZ((256)?)rmib(k?)$",
+                                                                "^VPCMPUDZ((256)?)rmbi(k?)$",
                                                                 "^VPTEST(N?)M(B|D|Q|W)Z((256)?)rm(k?)$",
                                                                 "^VPTEST(N?)M(D|Q)Z((256)?)rmb(k?)$")>;
 def : InstRW<[SPRWriteResGroup263, ReadAfterVecLd], (instregex "^VCMPS(D|H|S)Zrmi$",
                                                                "^VCMPS(D|H|S)Zrmi_Int(k?)$",
-                                                               "^VFPCLASSS(D|H|S)Zrmk$")>;
+                                                               "^VFPCLASSS(D|H|S)Zmik$")>;
 
 def SPRWriteResGroup264 : SchedWriteRes<[SPRPort00, SPRPort02_03_11]> {
   let Latency = 10;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 116aa3555a065..e4e833c8f752c 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -345,8 +345,8 @@ defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
 defm : X86WriteRes<WriteVecLoad,         [SKLPort23], 5, [1], 1>;
 defm : X86WriteRes<WriteVecLoadX,        [SKLPort23], 6, [1], 1>;
 defm : X86WriteRes<WriteVecLoadY,        [SKLPort23], 7, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNT,       [SKLPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNTY,      [SKLPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT,       [SKLPort23,SKLPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecLoadNTY,      [SKLPort23,SKLPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoad,   [SKLPort23,SKLPort015], 7, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoadY,  [SKLPort23,SKLPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecStore,        [SKLPort237,SKLPort4], 1, [1,1], 2>;
@@ -387,7 +387,7 @@ defm : SKLWriteResPair<WriteShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuff
 defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>;
 defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>;
 defm : X86WriteResPairUnsupported<WriteShuffleZ>;
-defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : SKLWriteResPair<WriteVarShuffle,  [SKLPort0,SKLPort5], 1, [1,1], 2, 5>; // Vector shuffles.
 defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>;
 defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>;
 defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index e733d9ac74dd8..62cc4a9ea290c 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -345,8 +345,8 @@ defm : SKXWriteResPair<WriteFVarBlendZ,[SKXPort015], 2, [2], 2, 7>;
 defm : X86WriteRes<WriteVecLoad,         [SKXPort23], 5, [1], 1>;
 defm : X86WriteRes<WriteVecLoadX,        [SKXPort23], 6, [1], 1>;
 defm : X86WriteRes<WriteVecLoadY,        [SKXPort23], 7, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNT,       [SKXPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteVecLoadNTY,      [SKXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT,       [SKXPort23,SKXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecLoadNTY,      [SKXPort23,SKXPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoad,   [SKXPort23,SKXPort015], 7, [1,1], 2>;
 defm : X86WriteRes<WriteVecMaskedLoadY,  [SKXPort23,SKXPort015], 8, [1,1], 2>;
 defm : X86WriteRes<WriteVecStore,        [SKXPort237,SKXPort4], 1, [1,1], 2>;
@@ -387,7 +387,7 @@ defm : SKXWriteResPair<WriteShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector shuff
 defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>;
 defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>;
 defm : SKXWriteResPair<WriteShuffleZ, [SKXPort5], 1, [1], 1, 7>;
-defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : SKXWriteResPair<WriteVarShuffle,  [SKXPort0,SKXPort5], 1, [1,1], 2, 5>; // Vector variable shuffles.
 defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>;
 defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>;
 defm : SKXWriteResPair<WriteVarShuffleZ, [SKXPort5], 1, [1], 1, 7>;
@@ -846,8 +846,8 @@ def: InstRW<[SKXWriteResGroup33], (instregex "KADD(B|D|Q|W)kk",
                                              "VCMPPD(Z|Z128|Z256)rri",
                                              "VCMPPS(Z|Z128|Z256)rri",
                                              "VCMP(SD|SS)Zrr",
-                                             "VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
-                                             "VFPCLASS(SD|SS)Zrr",
+                                             "VFPCLASS(PD|PS)(Z|Z128|Z256)ri",
+                                             "VFPCLASS(SD|SS)Zri",
                                              "VPCMPB(Z|Z128|Z256)rri",
                                              "VPCMPD(Z|Z128|Z256)rri",
                                              "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
@@ -1336,8 +1336,7 @@ def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
   let NumMicroOps = 2;
   let ReleaseAtCycles = [1,1];
 }
-def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
-                                          VPBLENDDrmi)>;
+def: InstRW<[SKXWriteResGroup95], (instrs VPBLENDDrmi)>;
 def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
                                   (instregex "VBLENDMPDZ128rm(b?)",
                                              "VBLENDMPSZ128rm(b?)",
@@ -1539,8 +1538,7 @@ def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
   let NumMicroOps = 2;
   let ReleaseAtCycles = [1,1];
 }
-def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
-                                           VPBLENDDYrmi)>;
+def: InstRW<[SKXWriteResGroup121], (instrs VPBLENDDYrmi)>;
 def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
                                    (instregex "VBLENDMPD(Z|Z256)rm(b?)",
                                               "VBLENDMPS(Z|Z256)rm(b?)",
@@ -1562,14 +1560,14 @@ def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
                                               "VBROADCASTI64X4Zrm(b?)",
                                               "VBROADCASTSD(Z|Z256)rm(b?)",
                                               "VBROADCASTSS(Z|Z256)rm(b?)",
-                                              "VINSERTF32x4(Z|Z256)rm(b?)",
-                                              "VINSERTF32x8Zrm(b?)",
-                                              "VINSERTF64x2(Z|Z256)rm(b?)",
-                                              "VINSERTF64x4Zrm(b?)",
-                                              "VINSERTI32x4(Z|Z256)rm(b?)",
-                                              "VINSERTI32x8Zrm(b?)",
-                                              "VINSERTI64x2(Z|Z256)rm(b?)",
-                                              "VINSERTI64x4Zrm(b?)",
+                                              "VINSERTF32X4(Z|Z256)rm(b?)",
+                                              "VINSERTF32X8Zrm(b?)",
+                                              "VINSERTF64X2(Z|Z256)rm(b?)",
+                                              "VINSERTF64X4Zrm(b?)",
+                                              "VINSERTI32X4(Z|Z256)rm(b?)",
+                                              "VINSERTI32X8Zrm(b?)",
+                                              "VINSERTI64X2(Z|Z256)rm(b?)",
+                                              "VINSERTI64X4Zrm(b?)",
                                               "VMOVAPD(Z|Z256)rm(b?)",
                                               "VMOVAPS(Z|Z256)rm(b?)",
                                               "VMOVDDUP(Z|Z256)rm(b?)",
@@ -1676,8 +1674,8 @@ def: InstRW<[SKXWriteResGroup136], (instrs VPMOVSXBWYrm,
                                            VPMOVSXWDYrm,
                                            VPMOVZXWDYrm)>;
 def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
-                                              "VFPCLASSSDZrm(b?)",
-                                              "VFPCLASSSSZrm(b?)",
+                                              "VFPCLASSSDZm(b?)i",
+                                              "VFPCLASSSSZm(b?)i",
                                               "(V?)PCMPGTQrm",
                                               "VPERMI2DZ128rm(b?)",
                                               "VPERMI2PDZ128rm(b?)",
@@ -1699,15 +1697,15 @@ def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
 }
 def: InstRW<[SKXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
                                                 "VCMP(SD|SS)Zrm",
-                                                "VFPCLASSPDZ128rm(b?)",
-                                                "VFPCLASSPSZ128rm(b?)",
-                                                "VPCMPBZ128rmi(b?)",
-                                                "VPCMPDZ128rmi(b?)",
+                                                "VFPCLASSPDZ128m(b?)i",
+                                                "VFPCLASSPSZ128m(b?)i",
+                                                "VPCMPBZ128rm(b?)i",
+                                                "VPCMPDZ128rm(b?)i",
                                                 "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
                                                 "VPCMPGT(B|D|Q|W)Z128rm(b?)",
-                                                "VPCMPQZ128rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Z128rmi(b?)",
-                                                "VPCMPWZ128rmi(b?)",
+                                                "VPCMPQZ128rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Z128rm(b?)i",
+                                                "VPCMPWZ128rm(b?)i",
                                                 "VPTESTMBZ128rm(b?)",
                                                 "VPTESTMDZ128rm(b?)",
                                                 "VPTESTMQZ128rm(b?)",
@@ -1763,10 +1761,10 @@ def SKXWriteResGroup148_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
 }
 def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
                                                 "VCMPPS(Z|Z256)rm(b?)i",
-                                                "VFPCLASSPD(Z|Z256)rm(b?)",
-                                                "VFPCLASSPS(Z|Z256)rm(b?)",
-                                                "VPCMPB(Z|Z256)rmi(b?)",
-                                                "VPCMPD(Z|Z256)rmi(b?)",
+                                                "VFPCLASSPD(Z|Z256)m(b?)i",
+                                                "VFPCLASSPS(Z|Z256)m(b?)i",
+                                                "VPCMPB(Z|Z256)rm(b?)i",
+                                                "VPCMPD(Z|Z256)rm(b?)i",
                                                 "VPCMPEQB(Z|Z256)rm(b?)",
                                                 "VPCMPEQD(Z|Z256)rm(b?)",
                                                 "VPCMPEQQ(Z|Z256)rm(b?)",
@@ -1775,10 +1773,10 @@ def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
                                                 "VPCMPGTD(Z|Z256)rm(b?)",
                                                 "VPCMPGTQ(Z|Z256)rm(b?)",
                                                 "VPCMPGTW(Z|Z256)rm(b?)",
-                                                "VPCMPQ(Z|Z256)rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Z256rmi(b?)",
-                                                "VPCMPU(B|D|Q|W)Zrmi(b?)",
-                                                "VPCMPW(Z|Z256)rmi(b?)",
+                                                "VPCMPQ(Z|Z256)rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Z256rm(b?)i",
+                                                "VPCMPU(B|D|Q|W)Zrm(b?)i",
+                                                "VPCMPW(Z|Z256)rm(b?)i",
                                                 "VPTESTM(B|D|Q|W)Z256rm(b?)",
                                                 "VPTESTM(B|D|Q|W)Zrm(b?)",
                                                 "VPTESTNM(B|D|Q|W)Z256rm(b?)",
diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index bab6769650a0c..931f5955955dc 100644
--- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -168,47 +168,6 @@ void XCoreDAGToDAGISel::Select(SDNode *N) {
     }
     break;
   }
-  case XCoreISD::LADD: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        N->getOperand(2) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
-  case XCoreISD::LSUB: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        N->getOperand(2) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
-  case XCoreISD::MACCU: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                      N->getOperand(2), N->getOperand(3) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
-  case XCoreISD::MACCS: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                      N->getOperand(2), N->getOperand(3) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
-  case XCoreISD::LMUL: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                      N->getOperand(2), N->getOperand(3) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
-  case XCoreISD::CRC8: {
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
-    ReplaceNode(N, CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32,
-                                          MVT::i32, Ops));
-    return;
-  }
   case ISD::BRIND:
     if (tryBRIND(N))
       return;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.td b/llvm/lib/Target/XCore/XCoreInstrInfo.td
index de1fb60a30f70..ca67eb044abd4 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.td
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -71,6 +71,41 @@ def SDT_XCoreLdwsp    : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
 def XCoreLdwsp        : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
                                [SDNPHasChain, SDNPMayLoad]>;
 
+def SDT_XCoreLAddSub : SDTypeProfile<2, 3, [
+  SDTCisVT<0, i32>, // result
+  SDTCisVT<1, i32>, // carry out
+  SDTCisVT<2, i32>, // lhs
+  SDTCisVT<3, i32>, // rhs
+  SDTCisVT<4, i32>  // carry in
+]>;
+
+def XCoreLAdd : SDNode<"XCoreISD::LADD", SDT_XCoreLAddSub>;
+def XCoreLSub : SDNode<"XCoreISD::LSUB", SDT_XCoreLAddSub>;
+
+// Used for both long multiplication (LMUL) and multiply-accumulate (MACC*).
+def SDT_XCoreMul : SDTypeProfile<2, 4, [
+  SDTCisVT<0, i32>, // result (high part)
+  SDTCisVT<1, i32>, // result (low part)
+  SDTCisVT<2, i32>, // LMUL: lhs;      MACC*: input tied to result 0 ($e = $a)
+  SDTCisVT<3, i32>, // LMUL: rhs;      MACC*: input tied to result 1 ($f = $b)
+  SDTCisVT<4, i32>, // LMUL: addend 1; MACC*: untied source ($c)
+  SDTCisVT<5, i32>, // LMUL: addend 2; MACC*: untied source ($d)
+]>;
+
+def XCoreLMul : SDNode<"XCoreISD::LMUL", SDT_XCoreMul>;
+def XCoreMAccU : SDNode<"XCoreISD::MACCU", SDT_XCoreMul>;
+def XCoreMAccS : SDNode<"XCoreISD::MACCS", SDT_XCoreMul>;
+
+def XCoreCRC8 : SDNode<"XCoreISD::CRC8",
+  SDTypeProfile<2, 3, [
+    SDTCisVT<0, i32>, // shifted data
+    SDTCisVT<1, i32>, // result crc
+    SDTCisVT<2, i32>, // initial crc
+    SDTCisVT<3, i32>, // data
+    SDTCisVT<4, i32>, // polynomial
+  ]>
+>;
+
 // These are target-independent nodes, but have target-specific formats.
 def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
                                              SDTCisVT<1, i32> ]>;
@@ -485,28 +520,35 @@ def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs),
 let Constraints = "$e = $a,$f = $b" in {
 def MACCU_l4r : _FL4RSrcDstSrcDst<
   0b000001, (outs GRRegs:$a, GRRegs:$b),
-  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>;
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d",
+  [(set i32:$a, i32:$b, (XCoreMAccU i32:$e, i32:$f, i32:$c, i32:$d))]>;
 
 def MACCS_l4r : _FL4RSrcDstSrcDst<
   0b000010, (outs GRRegs:$a, GRRegs:$b),
-  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>;
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d",
+  [(set i32:$a, i32:$b, (XCoreMAccS i32:$e, i32:$f, i32:$c, i32:$d))]>;
 }
 
 let Constraints = "$e = $b" in
 def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b),
                            (ins GRRegs:$e, GRRegs:$c, GRRegs:$d),
-                           "crc8 $b, $a, $c, $d", []>;
+                           "crc8 $b, $a, $c, $d",
+                           [(set i32:$a, i32:$b,
+                                 (XCoreCRC8 i32:$e, i32:$c, i32:$d))]>;
 
 // Five operand long
 
 def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2),
                      (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
                      "ladd $dst2, $dst1, $src1, $src2, $src3",
-                     []>;
+                     [(set i32:$dst1, i32:$dst2,
+                           (XCoreLAdd i32:$src1, i32:$src2, i32:$src3))]>;
 
 def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2),
                      (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
-                     "lsub $dst2, $dst1, $src1, $src2, $src3", []>;
+                     "lsub $dst2, $dst1, $src1, $src2, $src3",
+                     [(set i32:$dst1, i32:$dst2,
+                           (XCoreLSub i32:$src1, i32:$src2, i32:$src3))]>;
 
 def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
                       (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
@@ -517,7 +559,9 @@ def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
 def LMUL_l6r : _FL6R<
   0b00000, (outs GRRegs:$dst1, GRRegs:$dst2),
   (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4),
-  "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>;
+  "lmul $dst1, $dst2, $src1, $src2, $src3, $src4",
+  [(set i32:$dst1, i32:$dst2,
+        (XCoreLMul i32:$src1, i32:$src2, i32:$src3, i32:$src4))]>;
 
 // Register - U6
 
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index 49a35bfcf4b9b..da3fbc04300e2 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -15,7 +15,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
-#include "llvm/TargetParser/Triple.h"
 
 namespace llvm {
 namespace RISCV {
diff --git a/llvm/lib/TargetParser/SubtargetFeature.cpp b/llvm/lib/TargetParser/SubtargetFeature.cpp
index 2c51c403c1934..be42a42967332 100644
--- a/llvm/lib/TargetParser/SubtargetFeature.cpp
+++ b/llvm/lib/TargetParser/SubtargetFeature.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
-#include <algorithm>
 #include <string>
 #include <vector>
 
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7dfb8c021a8a5..b236e26f495df 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -107,6 +107,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
     {{"gfx940"},    {"gfx940"},  GK_GFX940,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
     {{"gfx941"},    {"gfx941"},  GK_GFX941,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
     {{"gfx942"},    {"gfx942"},  GK_GFX942,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
+    {{"gfx950"},    {"gfx950"},  GK_GFX950,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
     {{"gfx1010"},   {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
     {{"gfx1011"},   {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
     {{"gfx1012"},   {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
@@ -262,6 +263,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX940:  return {9, 4, 0};
   case GK_GFX941:  return {9, 4, 1};
   case GK_GFX942:  return {9, 4, 2};
+  case GK_GFX950:  return {9, 5, 0};
   case GK_GFX1010: return {10, 1, 0};
   case GK_GFX1011: return {10, 1, 1};
   case GK_GFX1012: return {10, 1, 2};
@@ -361,7 +363,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
     Features["wavefrontsize32"] = true;
     Features["wavefrontsize64"] = true;
   } else if (T.isAMDGCN()) {
-    switch (parseArchAMDGCN(GPU)) {
+    AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
+    switch (Kind) {
     case GK_GFX1201:
     case GK_GFX1200:
     case GK_GFX12_GENERIC:
@@ -466,12 +469,17 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       Features["s-memtime-inst"] = true;
       Features["gws"] = true;
       break;
+    case GK_GFX950:
+      Features["prng-inst"] = true;
+      Features["gfx950-insts"] = true;
+      [[fallthrough]];
     case GK_GFX942:
     case GK_GFX941:
     case GK_GFX940:
       Features["fp8-insts"] = true;
       Features["fp8-conversion-insts"] = true;
-      Features["xf32-insts"] = true;
+      if (Kind != GK_GFX950)
+        Features["xf32-insts"] = true;
       [[fallthrough]];
     case GK_GFX9_4_GENERIC:
       Features["gfx940-insts"] = true;
diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp
index efbf33b74776e..ce2feb65c9ec9 100644
--- a/llvm/lib/TextAPI/InterfaceFile.cpp
+++ b/llvm/lib/TextAPI/InterfaceFile.cpp
@@ -172,6 +172,7 @@ InterfaceFile::merge(const InterfaceFile *O) const {
 
   IF->setTwoLevelNamespace(isTwoLevelNamespace());
   IF->setApplicationExtensionSafe(isApplicationExtensionSafe());
+  IF->setOSLibNotForSharedCache(isOSLibNotForSharedCache());
 
   for (const auto &It : umbrellas()) {
     if (!It.second.empty())
@@ -238,6 +239,8 @@ InterfaceFile::remove(Architecture Arch) const {
       return make_error<TextAPIError>(TextAPIErrorCode::NoSuchArchitecture);
   }
 
+  // FIXME: Figure out how to keep these attributes in sync when new ones are
+  // added.
   std::unique_ptr<InterfaceFile> IF(new InterfaceFile());
   IF->setFileType(getFileType());
   IF->setPath(getPath());
@@ -248,6 +251,7 @@ InterfaceFile::remove(Architecture Arch) const {
   IF->setSwiftABIVersion(getSwiftABIVersion());
   IF->setTwoLevelNamespace(isTwoLevelNamespace());
   IF->setApplicationExtensionSafe(isApplicationExtensionSafe());
+  IF->setOSLibNotForSharedCache(isOSLibNotForSharedCache());
   for (const auto &It : umbrellas())
     if (It.first.Arch != Arch)
       IF->addParentUmbrella(It.first, It.second);
@@ -316,6 +320,7 @@ InterfaceFile::extract(Architecture Arch) const {
   IF->setSwiftABIVersion(getSwiftABIVersion());
   IF->setTwoLevelNamespace(isTwoLevelNamespace());
   IF->setApplicationExtensionSafe(isApplicationExtensionSafe());
+  IF->setOSLibNotForSharedCache(isOSLibNotForSharedCache());
   for (const auto &It : umbrellas())
     if (It.first.Arch == Arch)
       IF->addParentUmbrella(It.first, It.second);
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 55951e54518bd..8a5bae9f6f0d4 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -628,11 +628,11 @@ void CoroCloner::replaceRetconOrAsyncSuspendUses() {
     return;
 
   // Otherwise, we need to create an aggregate.
-  Value *Agg = PoisonValue::get(NewS->getType());
-  for (auto Arg : llvm::enumerate(Args))
-    Agg = Builder.CreateInsertValue(Agg, Arg.value(), Arg.index());
+  Value *Aggr = PoisonValue::get(NewS->getType());
+  for (auto [Idx, Arg] : llvm::enumerate(Args))
+    Aggr = Builder.CreateInsertValue(Aggr, Arg, Idx);
 
-  NewS->replaceAllUsesWith(Agg);
+  NewS->replaceAllUsesWith(Aggr);
 }
 
 void CoroCloner::replaceCoroSuspends() {
@@ -1834,8 +1834,8 @@ void coro::AsyncABI::splitCoroutine(Function &F, coro::Shape &Shape,
 
   // Create a continuation function for each of the suspend points.
   Clones.reserve(Shape.CoroSuspends.size());
-  for (auto CS : llvm::enumerate(Shape.CoroSuspends)) {
-    auto *Suspend = cast<CoroSuspendAsyncInst>(CS.value());
+  for (auto [Idx, CS] : llvm::enumerate(Shape.CoroSuspends)) {
+    auto *Suspend = cast<CoroSuspendAsyncInst>(CS);
 
     // Create the clone declaration.
     auto ResumeNameSuffix = ".resume.";
@@ -1851,8 +1851,8 @@ void coro::AsyncABI::splitCoroutine(Function &F, coro::Shape &Shape,
     }
     auto *Continuation = createCloneDeclaration(
         F, Shape,
-        UseSwiftMangling ? ResumeNameSuffix + Twine(CS.index()) + "_"
-                         : ResumeNameSuffix + Twine(CS.index()),
+        UseSwiftMangling ? ResumeNameSuffix + Twine(Idx) + "_"
+                         : ResumeNameSuffix + Twine(Idx),
         NextF, Suspend);
     Clones.push_back(Continuation);
 
@@ -1885,12 +1885,12 @@ void coro::AsyncABI::splitCoroutine(Function &F, coro::Shape &Shape,
   }
 
   assert(Clones.size() == Shape.CoroSuspends.size());
-  for (auto CS : llvm::enumerate(Shape.CoroSuspends)) {
-    auto *Suspend = CS.value();
-    auto *Clone = Clones[CS.index()];
+  for (auto [Idx, CS] : llvm::enumerate(Shape.CoroSuspends)) {
+    auto *Suspend = CS;
+    auto *Clone = Clones[Idx];
 
-    CoroCloner::createClone(F, "resume." + Twine(CS.index()), Shape, Clone,
-                            Suspend, TTI);
+    CoroCloner::createClone(F, "resume." + Twine(Idx), Shape, Clone, Suspend,
+                            TTI);
   }
 }
 
@@ -1947,12 +1947,12 @@ void coro::AnyRetconABI::splitCoroutine(Function &F, coro::Shape &Shape,
 
   // Create a continuation function for each of the suspend points.
   Clones.reserve(Shape.CoroSuspends.size());
-  for (auto CS : llvm::enumerate(Shape.CoroSuspends)) {
-    auto Suspend = cast<CoroSuspendRetconInst>(CS.value());
+  for (auto [Idx, CS] : llvm::enumerate(Shape.CoroSuspends)) {
+    auto Suspend = cast<CoroSuspendRetconInst>(CS);
 
     // Create the clone declaration.
     auto Continuation = createCloneDeclaration(
-        F, Shape, ".resume." + Twine(CS.index()), NextF, nullptr);
+        F, Shape, ".resume." + Twine(Idx), NextF, nullptr);
     Clones.push_back(Continuation);
 
     // Insert a branch to the unified return block immediately before
@@ -2016,12 +2016,12 @@ void coro::AnyRetconABI::splitCoroutine(Function &F, coro::Shape &Shape,
   }
 
   assert(Clones.size() == Shape.CoroSuspends.size());
-  for (auto CS : llvm::enumerate(Shape.CoroSuspends)) {
-    auto Suspend = CS.value();
-    auto Clone = Clones[CS.index()];
+  for (auto [Idx, CS] : llvm::enumerate(Shape.CoroSuspends)) {
+    auto Suspend = CS;
+    auto Clone = Clones[Idx];
 
-    CoroCloner::createClone(F, "resume." + Twine(CS.index()), Shape, Clone,
-                            Suspend, TTI);
+    CoroCloner::createClone(F, "resume." + Twine(Idx), Shape, Clone, Suspend,
+                            TTI);
   }
 }
 
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index bfcf491f36ea1..87d2432803062 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1876,8 +1876,8 @@ bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) {
   if (!ClReadSummary.empty()) {
     ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
                           ": ");
-    auto ReadSummaryFile =
-        ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+    auto ReadSummaryFile = ExitOnErr(errorOrToExpected(
+        MemoryBuffer::getFile(ClReadSummary, /*IsText=*/true)));
 
     yaml::Input In(ReadSummaryFile->getBuffer());
     In >> Summary;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index a6d6ea573d802..46ce011c5f788 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1357,14 +1357,10 @@ Instruction *InstCombinerImpl::
   //   low bits to skip = shift bitwidth - high bits to extract
   // The shift amount itself may be extended, and we need to look past zero-ext
   // when matching NBits, that will matter for matching later.
-  Constant *C;
   Value *NBits;
-  if (!match(
-          LowBitsToSkip,
-          m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
-      !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
-                                   APInt(C->getType()->getScalarSizeInBits(),
-                                         X->getType()->getScalarSizeInBits()))))
+  if (!match(LowBitsToSkip,
+             m_ZExtOrSelf(m_Sub(m_SpecificInt(XTy->getScalarSizeInBits()),
+                                m_ZExtOrSelf(m_Value(NBits))))))
     return nullptr;
 
   // Sign-extending value can be zero-extended if we `sub`tract it,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6cff3c7af91e3..42c0acd1e45ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -505,8 +505,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
 
   // If ctlz/cttz is only used as a shift amount, set is_zero_poison to true.
   if (II.hasOneUse() && match(Op1, m_Zero()) &&
-      match(II.user_back(), m_Shift(m_Value(), m_Specific(&II))))
+      match(II.user_back(), m_Shift(m_Value(), m_Specific(&II)))) {
+    II.dropUBImplyingAttrsAndMetadata();
     return IC.replaceOperand(II, 1, IC.Builder.getTrue());
+  }
 
   Constant *C;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index adbd9186c59c5..9588930d7658c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -750,7 +750,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                             Value *A, Value *B, Instruction &Outer,
                             SelectPatternFlavor SPF2, Value *C);
   Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
-  Instruction *foldSelectValueEquivalence(SelectInst &SI, ICmpInst &ICI);
+  Instruction *foldSelectValueEquivalence(SelectInst &SI, CmpInst &CI);
   bool replaceInInstruction(Value *V, Value *Old, Value *New,
                             unsigned Depth = 0);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 010b77548c152..2526ce7704ab1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1327,16 +1327,16 @@ bool InstCombinerImpl::replaceInInstruction(Value *V, Value *Old, Value *New,
 /// We can't replace %sel with %add unless we strip away the flags.
 /// TODO: Wrapping flags could be preserved in some cases with better analysis.
 Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
-                                                          ICmpInst &Cmp) {
-  if (!Cmp.isEquality())
-    return nullptr;
-
-  // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
+                                                          CmpInst &Cmp) {
+  // Canonicalize the pattern to an equivalence on the predicate by swapping the
+  // select operands.
   Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
   bool Swapped = false;
-  if (Cmp.getPredicate() == ICmpInst::ICMP_NE) {
+  if (Cmp.isEquivalence(/*Invert=*/true)) {
     std::swap(TrueVal, FalseVal);
     Swapped = true;
+  } else if (!Cmp.isEquivalence()) {
+    return nullptr;
   }
 
   Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1);
@@ -1347,7 +1347,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
     // would lead to an infinite replacement cycle.
     // If we will be able to evaluate f(Y) to a constant, we can allow undef,
     // otherwise Y cannot be undef as we might pick different values for undef
-    // in the icmp and in f(Y).
+    // in the cmp and in f(Y).
     if (TrueVal == OldOp)
       return nullptr;
 
@@ -1901,9 +1901,6 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
 /// Visit a SelectInst that has an ICmpInst as its first operand.
 Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
                                                       ICmpInst *ICI) {
-  if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI))
-    return NewSel;
-
   if (Value *V =
           canonicalizeSPF(*ICI, SI.getTrueValue(), SI.getFalseValue(), *this))
     return replaceInstUsesWith(SI, V);
@@ -3469,7 +3466,8 @@ static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
 // Note that the select is optimized away while the shift count is masked with
 // 31.  We handle some variations of the input operand like std::bit_ceil(X +
 // 1).
-static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) {
+static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder,
+                                InstCombinerImpl &IC) {
   Type *SelType = SI.getType();
   unsigned BitWidth = SelType->getScalarSizeInBits();
 
@@ -3504,6 +3502,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) {
   // single hardware instruction as opposed to BitWidth - CTLZ, where BitWidth
   // is an integer constant.  Masking with BitWidth-1 comes free on some
   // hardware as part of the shift instruction.
+
+  // Drop range attributes and re-infer them in the next iteration.
+  cast<Instruction>(Ctlz)->dropPoisonGeneratingAnnotations();
+  IC.addToWorklist(cast<Instruction>(Ctlz));
   Value *Neg = Builder.CreateNeg(Ctlz);
   Value *Masked =
       Builder.CreateAnd(Neg, ConstantInt::get(SelType, BitWidth - 1));
@@ -3848,6 +3850,10 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     return Fabs;
 
   // See if we are selecting two values based on a comparison of the two values.
+  if (CmpInst *CI = dyn_cast<CmpInst>(CondVal))
+    if (Instruction *NewSel = foldSelectValueEquivalence(SI, *CI))
+      return NewSel;
+
   if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
     if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
       return Result;
@@ -4147,7 +4153,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   if (sinkNotIntoOtherHandOfLogicalOp(SI))
     return &SI;
 
-  if (Instruction *I = foldBitCeil(SI, Builder))
+  if (Instruction *I = foldBitCeil(SI, Builder, *this))
     return I;
 
   if (Instruction *I = foldSelectToCmp(SI))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 5eb807dcb76ce..392c5c78345c2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -997,7 +997,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
             uint64_t MaskedGEPIndex = HighBitsGEPIndex | MaskedLowBitsGEPIndex;
 
             if (MaskedGEPIndex != GEPIndex) {
-              auto *GEP = cast<GetElementPtrInst>(II->getArgOperand(0));
+              auto *GEP = cast<GEPOperator>(II->getArgOperand(0));
               Builder.SetInsertPoint(I);
               Type *GEPIndexType =
                   DL.getIndexType(GEP->getPointerOperand()->getType());
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index ede89b099e8de..f56414adbc3e7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -205,9 +205,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
     if (IsBigEndian)
       ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;
     unsigned ShiftAmountC = ExtIndexC * DestWidth;
-    if (!ShiftAmountC ||
-        (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) &&
-        Ext.getVectorOperand()->hasOneUse())) {
+    if ((!ShiftAmountC ||
+         isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) &&
+        Ext.getVectorOperand()->hasOneUse()) {
       if (ShiftAmountC)
         X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");
       if (DestTy->isFloatingPointTy()) {
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index d59e0d26487d4..867158e782221 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LongestCommonSequence.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <map>
 #include <set>
@@ -856,6 +857,37 @@ memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) {
   return Calls;
 }
 
+/// For every caller GUID present in both the profile and the IR, records the
+/// LineLocation pairs longestCommonSequence matches between their call edges.
+DenseMap<uint64_t, LocToLocMap>
+memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
+                           const TargetLibraryInfo &TLI) {
+  DenseMap<uint64_t, LocToLocMap> UndriftMaps;
+
+  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
+      MemProfReader->getMemProfCallerCalleePairs();
+  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
+      extractCallsFromIR(M, TLI);
+
+  // Compute an undrift map for each CallerGUID.
+  for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
+    auto It = CallsFromProfile.find(CallerGUID);
+    if (It == CallsFromProfile.end())
+      continue;
+    const auto &ProfileAnchors = It->second;
+
+    LocToLocMap Matchings;
+    longestCommonSequence<LineLocation, GlobalValue::GUID>(
+        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
+        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
+    // Move, don't copy, the map. Each GUID is visited once, so this inserts.
+    [[maybe_unused]] bool Inserted =
+        UndriftMaps.try_emplace(CallerGUID, std::move(Matchings)).second;
+    assert(Inserted);
+  }
+  return UndriftMaps;
+}
+
 static void
 readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
             const TargetLibraryInfo &TLI,
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 64433cb9216c6..d03e3a0570cd3 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -767,7 +767,7 @@ ConstraintTy ConstraintInfo::getConstraintForSolving(CmpInst::Predicate Pred,
   if (CmpInst::isSigned(Pred) &&
       isKnownNonNegative(Op0, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1) &&
       isKnownNonNegative(Op1, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
-    Pred = CmpInst::getUnsignedPredicate(Pred);
+    Pred = ICmpInst::getUnsignedPredicate(Pred);
 
   SmallVector<Value *> NewVariables;
   ConstraintTy R = getConstraint(Pred, Op0, Op1, NewVariables);
@@ -857,7 +857,7 @@ void ConstraintInfo::transferToOtherSystem(
     if (IsKnownNonNegative(B)) {
       addFact(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0), NumIn,
               NumOut, DFSInStack);
-      addFact(CmpInst::getSignedPredicate(Pred), A, B, NumIn, NumOut,
+      addFact(ICmpInst::getSignedPredicate(Pred), A, B, NumIn, NumOut,
               DFSInStack);
     }
     break;
@@ -867,7 +867,7 @@ void ConstraintInfo::transferToOtherSystem(
     if (IsKnownNonNegative(A)) {
       addFact(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0), NumIn,
               NumOut, DFSInStack);
-      addFact(CmpInst::getSignedPredicate(Pred), A, B, NumIn, NumOut,
+      addFact(ICmpInst::getSignedPredicate(Pred), A, B, NumIn, NumOut,
               DFSInStack);
     }
     break;
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index b619248c59de0..5555b5e29cc74 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -883,7 +883,7 @@ ConstantRangeList getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args,
 struct DSEState {
   Function &F;
   AliasAnalysis &AA;
-  EarliestEscapeInfo EI;
+  EarliestEscapeAnalysis EA;
 
   /// The single BatchAA instance that is used to cache AA queries. It will
   /// not be invalidated over the whole run. This is safe, because:
@@ -943,7 +943,7 @@ struct DSEState {
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
            const LoopInfo &LI)
-      : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
+      : F(F), AA(AA), EA(DT, &LI), BatchAA(AA, &EA), MSSA(MSSA), DT(DT),
         PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
@@ -1850,7 +1850,7 @@ struct DSEState {
             NowDeadInsts.push_back(OpI);
         }
 
-      EI.removeInstruction(DeadInst);
+      EA.removeInstruction(DeadInst);
       // Remove memory defs directly if they don't produce results, but only
       // queue other dead instructions for later removal. They may have been
       // used as memory locations that have been cached by BatchAA. Removing
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index fa04ced7182dc..94bfe44a847a3 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1465,8 +1465,11 @@ static Instruction *cloneInstructionInExitBlock(
 
   if (MSSAU.getMemorySSA()->getMemoryAccess(&I)) {
     // Create a new MemoryAccess and let MemorySSA set its defining access.
+    // After running some passes, MemorySSA might be outdated, and the
+    // instruction `I` may have become a non-memory touching instruction.
     MemoryAccess *NewMemAcc = MSSAU.createMemoryAccessInBB(
-        New, nullptr, New->getParent(), MemorySSA::Beginning);
+        New, nullptr, New->getParent(), MemorySSA::Beginning,
+        /*CreationMustSucceed=*/false);
     if (NewMemAcc) {
       if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
         MSSAU.insertDef(MemDef, /*RenameUses=*/true);
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index db63bda1e6b92..a0c0080c0bda1 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -71,7 +72,7 @@ static const unsigned MaxMemInstrCount = 100;
 // Maximum loop depth supported.
 static const unsigned MaxLoopNestDepth = 10;
 
-#ifdef DUMP_DEP_MATRICIES
+#ifndef NDEBUG
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
     for (auto D : Row)
@@ -110,6 +111,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
                     << " Loads and Stores to analyze\n");
 
   ValueVector::iterator I, IE, J, JE;
+  StringSet<> Seen;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -156,7 +158,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
           Dep.push_back('I');
         }
 
-        DepMatrix.push_back(Dep);
+        // Make sure we only add unique entries to the dependency matrix.
+        if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+          DepMatrix.push_back(Dep);
+
         if (DepMatrix.size() > MaxMemInstrCount) {
           LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
                             << " dependencies inside loop\n");
@@ -234,6 +239,14 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) {
   LoopList.push_back(CurrentLoop);
 }
 
+/// Returns true if \p LoopList holds at least two loops, i.e. the nest is deep
+/// enough for an interchange; otherwise logs a debug message and returns false.
+static bool hasMinimumLoopDepth(SmallVectorImpl<Loop *> &LoopList) {
+  if (LoopList.size() >= 2)
+    return true;
+  LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+  return false;
+}
 namespace {
 
 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -416,11 +429,11 @@ struct LoopInterchange {
 
   bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
     bool Changed = false;
+
+    // Ensure minimum loop nest depth.
+    assert(hasMinimumLoopDepth(LoopList) && "Loop nest does not meet minimum depth.");
+
     unsigned LoopNestDepth = LoopList.size();
-    if (LoopNestDepth < 2) {
-      LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
-      return false;
-    }
     if (LoopNestDepth > MaxLoopNestDepth) {
       LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
                         << MaxLoopNestDepth << "\n");
@@ -441,10 +454,9 @@ struct LoopInterchange {
       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
       return false;
     }
-#ifdef DUMP_DEP_MATRICIES
-    LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
-    printDepMatrix(DependencyMatrix);
-#endif
+
+    LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
+               printDepMatrix(DependencyMatrix));
 
     // Get the Outermost loop exit.
     BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
@@ -484,10 +496,10 @@ struct LoopInterchange {
         std::swap(LoopList[i - 1], LoopList[i]);
         // Update the DependencyMatrix
         interChangeDependencies(DependencyMatrix, i, i - 1);
-#ifdef DUMP_DEP_MATRICIES
-        LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
-        printDepMatrix(DependencyMatrix);
-#endif
+
+        LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
+                   printDepMatrix(DependencyMatrix));
+
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -1712,6 +1724,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
                                            LoopStandardAnalysisResults &AR,
                                            LPMUpdater &U) {
   Function &F = *LN.getParent();
+  SmallVector<Loop *, 8> LoopList(LN.getLoops());
+  // Ensure minimum depth of the loop nest to do the interchange.
+  if (!hasMinimumLoopDepth(LoopList))
+    return PreservedAnalyses::all();
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
   std::unique_ptr<CacheCost> CC =
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index fb662daf03cbe..e9e1071ea210c 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -283,7 +283,7 @@ static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
 
 void MemCpyOptPass::eraseInstruction(Instruction *I) {
   MSSAU->removeMemoryAccess(I);
-  EEI->removeInstruction(I);
+  EEA->removeInstruction(I);
   I->eraseFromParent();
 }
 
@@ -638,7 +638,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
   if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
     return false;
 
-  BatchAAResults BAA(*AA, EEI);
+  BatchAAResults BAA(*AA, EEA);
   auto *T = LI->getType();
   // Don't introduce calls to memcpy/memmove intrinsics out of thin air if
   // the corresponding libcalls are not available.
@@ -1751,7 +1751,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
         return true;
       }
 
-  BatchAAResults BAA(*AA, EEI);
+  BatchAAResults BAA(*AA, EEA);
   // FIXME: Not using getClobberingMemoryAccess() here due to PR54682.
   MemoryAccess *AnyClobber = MA->getDefiningAccess();
   MemoryLocation DestLoc = MemoryLocation::getForDest(M);
@@ -1876,7 +1876,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
   if (!CallAccess)
     return false;
   MemCpyInst *MDep = nullptr;
-  BatchAAResults BAA(*AA, EEI);
+  BatchAAResults BAA(*AA, EEA);
   MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
       CallAccess->getDefiningAccess(), Loc, BAA);
   if (auto *MD = dyn_cast<MemoryDef>(Clobber))
@@ -1949,7 +1949,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
 /// 4. The memcpy src is not modified during the call. (ModRef check shows no
 /// Mod.)
 bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
-  BatchAAResults BAA(*AA, EEI);
+  BatchAAResults BAA(*AA, EEA);
   Value *ImmutArg = CB.getArgOperand(ArgNo);
 
   // 1. Ensure passed argument is immutable during call.
@@ -2117,8 +2117,8 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   MSSA = MSSA_;
   MemorySSAUpdater MSSAU_(MSSA_);
   MSSAU = &MSSAU_;
-  EarliestEscapeInfo EEI_(*DT);
-  EEI = &EEI_;
+  EarliestEscapeAnalysis EEA_(*DT);
+  EEA = &EEA_;
 
   while (true) {
     if (!iterateOnFunction(F))
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 4c8a5558b348c..cb6a4e34c226e 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -136,6 +137,26 @@ void llvm::CloneFunctionAttributesInto(Function *NewFunc,
                          OldAttrs.getRetAttrs(), NewArgAttrs));
 }
 
+DISubprogram *llvm::CollectDebugInfoForCloning(const Function &F,
+                                               CloneFunctionChangeType Changes,
+                                               DebugInfoFinder &DIFinder) {
+  DISubprogram *SPClonedWithinModule = nullptr;
+  if (Changes < CloneFunctionChangeType::DifferentModule) {
+    SPClonedWithinModule = F.getSubprogram();
+  }
+  if (SPClonedWithinModule)
+    DIFinder.processSubprogram(SPClonedWithinModule);
+
+  const Module *M = F.getParent();
+  if (Changes != CloneFunctionChangeType::ClonedModule && M) {
+    // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
+    for (const auto &I : instructions(F))
+      DIFinder.processInstruction(*M, I);
+  }
+
+  return SPClonedWithinModule;
+}
+
 // Clone OldFunc into NewFunc, transforming the old arguments into references to
 // VMap values.
 void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
@@ -168,23 +189,19 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
   // duplicate instructions and then freeze them in the MD map. We also record
   // information about dbg.value and dbg.declare to avoid duplicating the
   // types.
-  std::optional<DebugInfoFinder> DIFinder;
+  DebugInfoFinder DIFinder;
 
   // Track the subprogram attachment that needs to be cloned to fine-tune the
   // mapping within the same module.
-  DISubprogram *SPClonedWithinModule = nullptr;
   if (Changes < CloneFunctionChangeType::DifferentModule) {
+    // Need to find subprograms, types, and compile units.
+
     assert((NewFunc->getParent() == nullptr ||
             NewFunc->getParent() == OldFunc->getParent()) &&
            "Expected NewFunc to have the same parent, or no parent");
-
-    // Need to find subprograms, types, and compile units.
-    DIFinder.emplace();
-
-    SPClonedWithinModule = OldFunc->getSubprogram();
-    if (SPClonedWithinModule)
-      DIFinder->processSubprogram(SPClonedWithinModule);
   } else {
+    // Need to find all the compile units.
+
     assert((NewFunc->getParent() == nullptr ||
             NewFunc->getParent() != OldFunc->getParent()) &&
            "Expected NewFunc to have different parents, or no parent");
@@ -192,20 +209,22 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     if (Changes == CloneFunctionChangeType::DifferentModule) {
       assert(NewFunc->getParent() &&
              "Need parent of new function to maintain debug info invariants");
-
-      // Need to find all the compile units.
-      DIFinder.emplace();
     }
   }
 
+  DISubprogram *SPClonedWithinModule =
+      CollectDebugInfoForCloning(*OldFunc, Changes, DIFinder);
+
   // Loop over all of the basic blocks in the function, cloning them as
   // appropriate.  Note that we save BE this way in order to handle cloning of
   // recursive functions into themselves.
   for (const BasicBlock &BB : *OldFunc) {
 
     // Create a new basic block and copy instructions into it!
-    BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
-                                      DIFinder ? &*DIFinder : nullptr);
+    // NOTE: don't pass DIFinder because instructions' debug info was processed
+    // in CollectDebugInfoForCloning. This will be cleaned up further.
+    BasicBlock *CBB =
+        CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, nullptr);
 
     // Add basic block mapping.
     VMap[&BB] = CBB;
@@ -228,7 +247,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
   }
 
   if (Changes < CloneFunctionChangeType::DifferentModule &&
-      DIFinder->subprogram_count() > 0) {
+      DIFinder.subprogram_count() > 0) {
     // Turn on module-level changes, since we need to clone (some of) the
     // debug info metadata.
     //
@@ -243,7 +262,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
 
     // Avoid cloning types, compile units, and (other) subprograms.
     SmallPtrSet<const DISubprogram *, 16> MappedToSelfSPs;
-    for (DISubprogram *ISP : DIFinder->subprograms()) {
+    for (DISubprogram *ISP : DIFinder.subprograms()) {
       if (ISP != SPClonedWithinModule) {
         mapToSelfIfNew(ISP);
         MappedToSelfSPs.insert(ISP);
@@ -251,16 +270,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     }
 
     // If a subprogram isn't going to be cloned skip its lexical blocks as well.
-    for (DIScope *S : DIFinder->scopes()) {
+    for (DIScope *S : DIFinder.scopes()) {
       auto *LScope = dyn_cast<DILocalScope>(S);
       if (LScope && MappedToSelfSPs.count(LScope->getSubprogram()))
         mapToSelfIfNew(S);
     }
 
-    for (DICompileUnit *CU : DIFinder->compile_units())
+    for (DICompileUnit *CU : DIFinder.compile_units())
       mapToSelfIfNew(CU);
 
-    for (DIType *Type : DIFinder->types())
+    for (DIType *Type : DIFinder.types())
       mapToSelfIfNew(Type);
   } else {
     assert(!SPClonedWithinModule &&
@@ -314,7 +333,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
   SmallPtrSet<const void *, 8> Visited;
   for (auto *Operand : NMD->operands())
     Visited.insert(Operand);
-  for (auto *Unit : DIFinder->compile_units()) {
+  for (auto *Unit : DIFinder.compile_units()) {
     MDNode *MappedUnit =
         MapMetadata(Unit, VMap, RF_None, TypeMapper, Materializer);
     if (Visited.insert(MappedUnit).second)
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 509b6d6226551..42c258aa2c7b0 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3326,7 +3326,8 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
         K->mergeDIAssignID(J);
         break;
       case LLVMContext::MD_tbaa:
-        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+        if (DoesKMove)
+          K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
         break;
       case LLVMContext::MD_alias_scope:
         K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
@@ -3336,8 +3337,9 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
         K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
         break;
       case LLVMContext::MD_access_group:
-        K->setMetadata(LLVMContext::MD_access_group,
-                       intersectAccessGroups(K, J));
+        if (DoesKMove)
+          K->setMetadata(LLVMContext::MD_access_group,
+                         intersectAccessGroups(K, J));
         break;
       case LLVMContext::MD_range:
         if (DoesKMove || !K->hasMetadata(LLVMContext::MD_noundef))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1d9e4f5a19f5c..fda6550a37548 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -131,6 +131,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
@@ -185,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
              "loops."));
 
 static cl::opt<unsigned> EpilogueVectorizationMinVF(
-    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+    "epilogue-vectorization-minimum-VF", cl::Hidden,
     cl::desc("Only loops with vectorization factor equal to or larger than "
              "the specified value are considered for epilogue vectorization."));
 
@@ -1519,7 +1520,7 @@ class LoopVectorizationCostModel {
   /// \p Multiplier is an aditional scaling factor applied to VF before
   /// comparing to EpilogueVectorizationMinVF.
   bool isEpilogueVectorizationProfitable(const ElementCount VF,
-                                         const unsigned Multiplier) const;
+                                         const unsigned IC) const;
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
@@ -4291,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
   return TTI.getVScaleForTuning();
 }
 
+/// This function attempts to return a value that represents the vectorization
+/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// time, but for scalable VFs we calculate it based on an estimate of the
+/// vscale value.
+static unsigned getEstimatedRuntimeVF(const Loop *L,
+                                      const TargetTransformInfo &TTI,
+                                      ElementCount VF) {
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
+      EstimatedVF *= *VScale;
+  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+  return EstimatedVF;
+}
+
 bool LoopVectorizationPlanner::isMoreProfitable(
     const VectorizationFactor &A, const VectorizationFactor &B,
     const unsigned MaxTripCount) const {
@@ -4593,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       InstructionCost C = CM.expectedCost(VF);
       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
 
-      unsigned AssumedMinimumVscale =
-          getVScaleForTuning(OrigLoop, TTI).value_or(1);
-      unsigned Width =
-          Candidate.Width.isScalable()
-              ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
-              : Candidate.Width.getFixedValue();
+      unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
       LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                         << " costs: " << (Candidate.Cost / Width));
       if (VF.isScalable())
         LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
-                          << AssumedMinimumVscale << ")");
+                          << getVScaleForTuning(OrigLoop, TTI).value_or(1)
+                          << ")");
       LLVM_DEBUG(dbgs() << ".\n");
 
       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4669,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
 }
 
 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
-    const ElementCount VF, const unsigned Multiplier) const {
+    const ElementCount VF, const unsigned IC) const {
   // FIXME: We need a much better cost-model to take different parameters such
   // as register pressure, code size increase and cost of extra branches into
   // account. For now we apply a very crude heuristic and only consider loops
@@ -4684,9 +4696,15 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   if (TTI.getMaxInterleaveFactor(VF) <= 1)
     return false;
 
-  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
-    return true;
-  return false;
+  // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
+  // VFs when deciding profitability.
+  // See related "TODO: extend to support scalable VFs." in
+  // selectEpilogueVectorizationFactor.
+  unsigned Multiplier = VF.isFixed() ? IC : 1;
+  unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
+                                ? EpilogueVectorizationMinVF
+                                : TTI.getEpilogueVectorizationMinVF();
+  return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
 }
 
 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4729,11 +4747,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
   }
 
-  unsigned Multiplier = IC;
-  if (MainLoopVF.isScalable())
-    Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);
-
-  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
+  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                          "this loop\n");
     return Result;
@@ -4742,12 +4756,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
   // the main loop handles 8 lanes per iteration. We could still benefit from
   // vectorizing the epilogue loop with VF=4.
-  ElementCount EstimatedRuntimeVF = MainLoopVF;
-  if (MainLoopVF.isScalable()) {
-    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
-    if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
-      EstimatedRuntimeVF *= *VScale;
-  }
+  ElementCount EstimatedRuntimeVF =
+      ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
 
   ScalarEvolution &SE = *PSE.getSE();
   Type *TCType = Legal->getWidestInductionType();
@@ -4987,13 +4997,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   }
 
-  unsigned EstimatedVF = VF.getKnownMinValue();
-  if (VF.isScalable()) {
-    if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
-      EstimatedVF *= *VScale;
-  }
-  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
-
+  unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   if (KnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
@@ -7424,7 +7428,17 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
 
   // Now compute and add the VPlan-based cost.
   Cost += Plan.cost(VF, CostCtx);
-  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+#ifndef NDEBUG
+  unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
+                    << " (Estimated cost per lane: ");
+  if (Cost.isValid()) {
+    double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
+    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
+  } else /* No point dividing an invalid cost - it will still be invalid */
+    LLVM_DEBUG(dbgs() << "Invalid");
+  LLVM_DEBUG(dbgs() << ")\n");
+#endif
   return Cost;
 }
 
@@ -9797,8 +9811,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
 }
 
 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF,
-                                       std::optional<unsigned> VScale, Loop *L,
+                                       VectorizationFactor &VF, Loop *L,
+                                       const TargetTransformInfo &TTI,
                                        PredicatedScalarEvolution &PSE,
                                        ScalarEpilogueLowering SEL) {
   InstructionCost CheckCost = Checks.getCost();
@@ -9850,13 +9864,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
   // the computations are performed on doubles, not integers and the result
   // is rounded up, hence we get an upper estimate of the TC.
-  unsigned IntVF = VF.Width.getKnownMinValue();
-  if (VF.Width.isScalable()) {
-    unsigned AssumedMinimumVscale = 1;
-    if (VScale)
-      AssumedMinimumVscale = *VScale;
-    IntVF *= AssumedMinimumVscale;
-  }
+  unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
   uint64_t RtC = *CheckCost.getValue();
   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10105,8 +10113,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    PSE, SEL)) {
+        !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 918d7663548f5..47dcde7d9d189 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6815,16 +6815,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
         // Check if it is profitable to try vectorizing gathered loads. It is
         // profitable if we have more than 3 consecutive loads or if we have
         // less but all users are vectorized or deleted.
-        bool AllowToVectorize =
-            NumElts >= 3 ||
-            any_of(ValueToGatherNodes.at(Slice.front()),
-                   [=](const TreeEntry *TE) {
-                     return TE->Scalars.size() == 2 &&
-                            ((TE->Scalars.front() == Slice.front() &&
-                              TE->Scalars.back() == Slice.back()) ||
-                             (TE->Scalars.front() == Slice.back() &&
-                              TE->Scalars.back() == Slice.front()));
-                   });
+        bool AllowToVectorize = false;
         // Check if it is profitable to vectorize 2-elements loads.
         if (NumElts == 2) {
           bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
@@ -6861,6 +6852,19 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
             return true;
           };
           AllowToVectorize = CheckIfAllowed(Slice);
+        } else {
+          AllowToVectorize =
+              (NumElts >= 3 ||
+               any_of(ValueToGatherNodes.at(Slice.front()),
+                      [=](const TreeEntry *TE) {
+                        return TE->Scalars.size() == 2 &&
+                               ((TE->Scalars.front() == Slice.front() &&
+                                 TE->Scalars.back() == Slice.back()) ||
+                                (TE->Scalars.front() == Slice.back() &&
+                                 TE->Scalars.back() == Slice.front()));
+                      })) &&
+              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
+                                       Slice.size());
         }
         if (AllowToVectorize) {
           SmallVector<Value *> PointerOps;
@@ -6903,7 +6907,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
       }
       // Mark masked gathers candidates as vectorized, if any.
       for (unsigned Cnt : MaskedGatherVectorized) {
-        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(Cnt, NumElts);
+        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
+            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
         ArrayRef<Value *> Values(
             reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
         Results.emplace_back(Values, LoadsState::ScatterVectorize);
@@ -10291,10 +10296,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             Idx = EMask[Idx];
         }
         CommonVF = E->Scalars.size();
-      } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
-                 Factor && E->Scalars.size() != Mask.size() &&
+      } else if (unsigned Factor = E->getInterleaveFactor();
+                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                  ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
-                                                               *Factor)) {
+                                                               Factor)) {
         // Deinterleaved nodes are free.
         std::iota(CommonMask.begin(), CommonMask.end(), 0);
       }
@@ -13609,7 +13614,10 @@ Value *BoUpSLP::gather(
     } else {
       Vec = CreateShuffle(Root, Vec, Mask);
       if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
-          OI && OI->hasNUses(0))
+          OI && OI->hasNUses(0) &&
+          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+            return TE->VectorizedValue == OI;
+          }))
         eraseInstruction(OI);
     }
   }
@@ -15708,16 +15716,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
           return E->VectorizedValue;
         }
-        assert(isa<ShuffleVectorInst>(Src) &&
-               "Not supported shufflevector usage.");
-        auto *SVSrc = cast<ShuffleVectorInst>(Src);
-        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
-               "Not supported shufflevector usage.");
         SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
-        SmallVector<int> NewMask(ThisMask.size());
-        transform(ThisMask, NewMask.begin(),
-                  [&SVSrc](int Mask) { return SVSrc->getShuffleMask()[Mask]; });
-        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
+        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
+          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
+                 "Not supported shufflevector usage.");
+          SmallVector<int> NewMask(ThisMask.size());
+          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
+            return SVSrc->getShuffleMask()[Mask];
+          });
+          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
+        } else {
+          V = Builder.CreateShuffleVector(Src, ThisMask);
+        }
         propagateIRFlags(V, E->Scalars, VL0);
         if (auto *I = dyn_cast<Instruction>(V))
           V = propagateMetadata(I, E->Scalars);
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
index 6217c9fecf45d..4b0e12c28f07b 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp
@@ -10,6 +10,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/SandboxIR/Instruction.h"
 #include "llvm/SandboxIR/Utils.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h"
 
 namespace llvm::sandboxir {
 
@@ -58,6 +59,12 @@ bool PredIterator::operator==(const PredIterator &Other) const {
   return OpIt == Other.OpIt && MemIt == Other.MemIt;
 }
 
+DGNode::~DGNode() {
+  if (SB == nullptr)
+    return;
+  SB->eraseFromBundle(this);
+}
+
 #ifndef NDEBUG
 void DGNode::print(raw_ostream &OS, bool PrintDeps) const {
   OS << *I << " USuccs:" << UnscheduledSuccs << " Sched:" << Scheduled << "\n";
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
index 17544afcc185f..6ea34c5e0598d 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp
@@ -140,8 +140,8 @@ LLVM_DUMP_METHOD void SeedContainer::dump() const { print(dbgs()); }
 #endif // NDEBUG
 
 template <typename LoadOrStoreT> static bool isValidMemSeed(LoadOrStoreT *LSI) {
-  if (LSI->isSimple())
-    return true;
+  if (!LSI->isSimple())
+    return false;
   auto *Ty = Utils::getExpectedType(LSI);
   // Omit types that are architecturally unvectorizable
   if (Ty->isX86_FP80Ty() || Ty->isPPC_FP128Ty())
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 574b7338f63fb..800ea223850d3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -747,38 +747,38 @@ define void @smax() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'smax'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t0 = call <2 x i8> @llvm.vp.smax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t2 = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t4 = call <8 x i8> @llvm.vp.smax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %t6 = call <16 x i8> @llvm.vp.smax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t8 = call <2 x i16> @llvm.vp.smax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t10 = call <4 x i16> @llvm.vp.smax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t12 = call <8 x i16> @llvm.vp.smax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t14 = call <16 x i16> @llvm.vp.smax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t16 = call <2 x i32> @llvm.vp.smax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t18 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t20 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t22 = call <16 x i32> @llvm.vp.smax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t24 = call <2 x i64> @llvm.vp.smax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %t26 = call <4 x i64> @llvm.vp.smax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t28 = call <8 x i64> @llvm.vp.smax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t30 = call <16 x i64> @llvm.vp.smax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.smax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.smax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.smax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.smax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.smax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.smax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.smax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.smax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.smax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.smax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.smax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.smax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.smax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.smax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
@@ -854,38 +854,38 @@ define void @smin() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'smin'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t0 = call <2 x i8> @llvm.vp.smin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t2 = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t4 = call <8 x i8> @llvm.vp.smin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %t6 = call <16 x i8> @llvm.vp.smin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t8 = call <2 x i16> @llvm.vp.smin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t10 = call <4 x i16> @llvm.vp.smin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t12 = call <8 x i16> @llvm.vp.smin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t14 = call <16 x i16> @llvm.vp.smin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t16 = call <2 x i32> @llvm.vp.smin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t18 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t20 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t22 = call <16 x i32> @llvm.vp.smin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t24 = call <2 x i64> @llvm.vp.smin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %t26 = call <4 x i64> @llvm.vp.smin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t28 = call <8 x i64> @llvm.vp.smin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t30 = call <16 x i64> @llvm.vp.smin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.smin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.smin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.smin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.smin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.smin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.smin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.smin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.smin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.smin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.smin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.smin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.smin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.smin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.smin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.smin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.smin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.smin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.smin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.smin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.smin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.smin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.smin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.smin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.smin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.smin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.smin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.smin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.smin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.smin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
@@ -960,38 +960,38 @@ define void @umax() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'umax'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t0 = call <2 x i8> @llvm.vp.umax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t2 = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t4 = call <8 x i8> @llvm.vp.umax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %t6 = call <16 x i8> @llvm.vp.umax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t8 = call <2 x i16> @llvm.vp.umax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t10 = call <4 x i16> @llvm.vp.umax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t12 = call <8 x i16> @llvm.vp.umax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t14 = call <16 x i16> @llvm.vp.umax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t16 = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t18 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t20 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t22 = call <16 x i32> @llvm.vp.umax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t24 = call <2 x i64> @llvm.vp.umax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %t26 = call <4 x i64> @llvm.vp.umax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t28 = call <8 x i64> @llvm.vp.umax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t30 = call <16 x i64> @llvm.vp.umax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.umax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.umax.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.umax.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.umax.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.umax.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.umax.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.umax.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.umax.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.umax.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.umax.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.umax.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.umax.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umax.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umax.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umax.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umax.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umax.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umax.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umax.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umax.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umax.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umax.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umax.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umax.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umax.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umax.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umax.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.umax.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
@@ -1066,38 +1066,38 @@ define void @umin() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'umin'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t0 = call <2 x i8> @llvm.vp.umin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t2 = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t4 = call <8 x i8> @llvm.vp.umin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 157 for instruction: %t6 = call <16 x i8> @llvm.vp.umin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t8 = call <2 x i16> @llvm.vp.umin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t10 = call <4 x i16> @llvm.vp.umin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %t12 = call <8 x i16> @llvm.vp.umin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t14 = call <16 x i16> @llvm.vp.umin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t16 = call <2 x i32> @llvm.vp.umin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %t18 = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t20 = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t22 = call <16 x i32> @llvm.vp.umin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %t24 = call <2 x i64> @llvm.vp.umin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %t26 = call <4 x i64> @llvm.vp.umin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %t28 = call <8 x i64> @llvm.vp.umin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %t30 = call <16 x i64> @llvm.vp.umin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.umin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.umin.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.umin.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i16> @llvm.vp.umin.v2i16(<2 x i16> undef, <2 x i16> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t10 = call <4 x i16> @llvm.vp.umin.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t12 = call <8 x i16> @llvm.vp.umin.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t14 = call <16 x i16> @llvm.vp.umin.v16i16(<16 x i16> undef, <16 x i16> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <2 x i32> @llvm.vp.umin.v2i32(<2 x i32> undef, <2 x i32> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> undef, <8 x i32> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = call <16 x i32> @llvm.vp.umin.v16i32(<16 x i32> undef, <16 x i32> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t24 = call <2 x i64> @llvm.vp.umin.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = call <4 x i64> @llvm.vp.umin.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = call <8 x i64> @llvm.vp.umin.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = call <16 x i64> @llvm.vp.umin.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t32 = call <vscale x 2 x i8> @llvm.vp.umin.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t34 = call <vscale x 4 x i8> @llvm.vp.umin.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t36 = call <vscale x 8 x i8> @llvm.vp.umin.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t38 = call <vscale x 16 x i8> @llvm.vp.umin.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t40 = call <vscale x 2 x i16> @llvm.vp.umin.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t42 = call <vscale x 4 x i16> @llvm.vp.umin.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t44 = call <vscale x 8 x i16> @llvm.vp.umin.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t46 = call <vscale x 16 x i16> @llvm.vp.umin.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t48 = call <vscale x 2 x i32> @llvm.vp.umin.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t50 = call <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t52 = call <vscale x 8 x i32> @llvm.vp.umin.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t54 = call <vscale x 16 x i32> @llvm.vp.umin.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t56 = call <vscale x 2 x i64> @llvm.vp.umin.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t58 = call <vscale x 4 x i64> @llvm.vp.umin.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t60 = call <vscale x 8 x i64> @llvm.vp.umin.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t62 = call <vscale x 16 x i64> @llvm.vp.umin.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x i8> @llvm.vp.umin.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
@@ -1172,37 +1172,37 @@ define void @abs() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'abs'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = call <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %22 = call <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %26 = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %30 = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> undef, i1 false)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %31 = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = call <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64> undef, i1 false)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 8bef7583c35c0..7d8a25f022e7d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -39,3 +39,44 @@ loop:
 exit:
    ret void
 }
+
+; Same as previous test, but with selects replaced by phis in the same block.
+define void @test_phi(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
+; CHECK-LABEL: 'test_phi'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+   %gep.y = getelementptr double, ptr %y, i64 -32 ; %y moved back by 32 doubles
+   br label %loop
+
+loop:
+   %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ] ; induction variable, starts at 0
+   %icmp = icmp ule i64 %iv, 32 ; true while %iv <= 32 (unsigned compare)
+   br i1 %icmp, label %if, label %latch
+
+if:
+   br label %latch
+
+latch:
+   %sel = phi ptr [ %x, %if ], [ %gep.y, %loop ] ; load base: %x when coming from %if, otherwise %gep.y
+   %sel2 = phi ptr [ %y, %if ], [ %z, %loop ] ; store base: %y when coming from %if, otherwise %z
+   %gep.sel = getelementptr inbounds double, ptr %sel, i64 %iv
+   %load = load double, ptr %gep.sel, align 8
+   %gep.sel2 = getelementptr inbounds double, ptr %sel2, i64 %iv
+   store double %load, ptr %gep.sel2, align 8 ; copy element %iv from the load base to the store base
+   %iv.next = add nuw nsw i64 %iv, 1
+   %exit.cond = icmp eq i64 %iv, 94 ; compares %iv (not %iv.next); exit once it equals 94
+   br i1 %exit.cond, label %exit, label %loop
+
+exit:
+   ret void
+}
diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/load-size-cache.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/load-size-cache.ll
new file mode 100644
index 0000000000000..388f0217d200d
--- /dev/null
+++ b/llvm/test/Analysis/MemoryDependenceAnalysis/load-size-cache.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=gvn -dom-tree-reachability-max-bbs-to-explore=1 -S < %s | FileCheck %s
+define i8 @f(i1 %arg0, i1 %arg1, i1 %arg2) {
+; CHECK-LABEL: define i8 @f(
+; CHECK-SAME: i1 [[ARG0:%.*]], i1 [[ARG1:%.*]], i1 [[ARG2:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    br i1 [[ARG2]], label %[[BB2:.*]], label %[[BB11:.*]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    br label %[[BB3]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias ptr @_Znwm(i64 2)
+; CHECK-NEXT:    br label %[[BB4:.*]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ null, %[[BB4]] ], [ [[CALL]], %[[BB3]] ]
+; CHECK-NEXT:    store i8 2, ptr [[PHI]], align 4
+; CHECK-NEXT:    br i1 [[ARG0]], label %[[BB5:.*]], label %[[BB4]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    br i1 [[ARG1]], label %[[BB7:.*]], label %[[BB6:.*]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @use(i64 undef)
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    ret i8 4
+;
+bb:
+  br i1 %arg2, label %bb2, label %bb11
+
+bb11:
+  br label %bb3
+
+bb2:
+  br label %bb3
+
+bb3:
+  %call = tail call noalias ptr @_Znwm(i64 2) ; noalias result of @_Znwm(2); presumably a 2-byte operator new - verify
+  br label %bb4
+
+bb4:
+  %phi = phi ptr [ null, %bb4 ], [ %call, %bb3 ] ; %call on entry from %bb3, null on the self-loop back edge
+  store i8 2, ptr %phi, align 4
+  br i1 %arg0, label %bb5, label %bb4
+
+bb5:
+  br i1 %arg1, label %bb7, label %bb6
+
+bb6:
+  %load = load i64, ptr %call, align 4 ; i64-sized load of %call; the expected output passes undef to @use instead
+  call void @use(i64 %load)
+  br label %bb9
+
+bb7:
+  %load8 = load i8, ptr %call, align 4 ; i8-sized load of the same pointer; it is absent from the expected output
+  br label %bb9
+
+bb9:
+  %phi10 = phi i8 [ %load8, %bb7 ], [ 4, %bb6 ] ; the expected output returns the constant 4 directly
+  ret i8 %phi10
+}
+
+declare ptr @_Znwm(i64)
+declare void @use(i64)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index b215fc2c2ae74..00a3aaf77f900 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,31 @@ bb:
   ret void
 }
 
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <4 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/Assembler/target-type-properties.ll b/llvm/test/Assembler/target-type-properties.ll
index 49c9d812f1cf4..60790dbc5c17b 100644
--- a/llvm/test/Assembler/target-type-properties.ll
+++ b/llvm/test/Assembler/target-type-properties.ll
@@ -1,6 +1,8 @@
 ; RUN: split-file %s %t
 ; RUN: not llvm-as < %t/zeroinit-error.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-ZEROINIT %s
-; RUN: not llvm-as < %t/global-var.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-GLOBALVAR %s
+; RUN: not llvm-as < %t/global-var.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-GLOBAL-VAR %s
+; RUN: not llvm-as < %t/global-array.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-GLOBAL-ARRAY %s
+; RUN: not llvm-as < %t/global-struct.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-GLOBAL-STRUCT %s
 ; Check target extension type properties are verified in the assembler.
 
 ;--- zeroinit-error.ll
@@ -12,5 +14,13 @@ define void @foo() {
 }
 
 ;--- global-var.ll
-@global = external global target("unknown_target_type")
-; CHECK-GLOBALVAR: Global @global has illegal target extension type
+@global_var = external global target("unknown_target_type")
+; CHECK-GLOBAL-VAR: Global @global_var has illegal target extension type
+
+;--- global-array.ll
+@global_array = external global [4 x target("unknown_target_type")]
+; CHECK-GLOBAL-ARRAY: Global @global_array has illegal target extension type
+
+;--- global-struct.ll
+@global_struct = external global {target("unknown_target_type")}
+; CHECK-GLOBAL-STRUCT: Global @global_struct has illegal target extension type
diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll
index c8d36f812c208..c95c145a08788 100644
--- a/llvm/test/Bitcode/summary_version.ll
+++ b/llvm/test/Bitcode/summary_version.ll
@@ -2,7 +2,7 @@
 ; RUN: opt  -module-summary  %s -o - | llvm-bcanalyzer -dump | FileCheck %s
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK: <VERSION op0=11/>
+; CHECK: <VERSION op0=12/>
 
 
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index bc4b5ae7c066a..20cba54923548 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -176,3 +176,104 @@ body:             |
     $q1 = COPY %o_wide
     RET_ReallyLR implicit $w0
 ...
+---
+name:            sub_may
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sub_may
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            usub_may
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s1) = G_USUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            usub_may_carry_s11
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may_carry_s11
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s11) = G_USUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s11)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s11) = G_USUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s11)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            usub_may_carry_s11_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may_carry_s11_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK-NEXT: %bv1:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), %const(s32)
+    ; CHECK-NEXT: %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1
+    ; CHECK-NEXT: %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>)
+    ; CHECK-NEXT: $q0 = COPY %sub(<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY %o_wide(<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w0
+    %2:_(s32) = COPY $w0
+    %3:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), %0(s32), %1(s32), %2(s32)
+    %bv1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %const(s32)
+    %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1
+    %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>)
+    $q0 = COPY %sub(<4 x s32>)
+    $q1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll b/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll
deleted file mode 100644
index a793ecbf03f65..0000000000000
--- a/llvm/test/CodeGen/AArch64/GlobalISel/freeze.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s 2>&1 | FileCheck %s
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefix=GISEL
-
-%struct.T = type { i32, i32 }
-
-define i32 @freeze_int() {
-; CHECK-LABEL: freeze_int:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w0, w8, w8
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_int:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    mul w0, w8, w8
-; GISEL-NEXT:    ret
-  %y1 = freeze i32 undef
-  %t1 = mul i32 %y1, %y1
-  ret i32 %t1
-}
-
-define i5 @freeze_int2() {
-; CHECK-LABEL: freeze_int2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w0, w8, w8
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_int2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    mul w0, w8, w8
-; GISEL-NEXT:    ret
-  %y1 = freeze i5 undef
-  %t1 = mul i5 %y1, %y1
-  ret i5 %t1
-}
-
-define float @freeze_float() {
-; CHECK-LABEL: freeze_float:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fadd s0, s0, s0
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_float:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    fadd s0, s0, s0
-; GISEL-NEXT:    ret
-  %y1 = freeze float undef
-  %t1 = fadd float %y1, %y1
-  ret float %t1
-}
-
-define <2 x i32> @freeze_ivec() {
-; CHECK-LABEL: freeze_ivec:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_ivec:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add v0.2s, v0.2s, v0.2s
-; GISEL-NEXT:    ret
-  %y1 = freeze <2 x i32> undef
-  %t1 = add <2 x i32> %y1, %y1
-  ret <2 x i32> %t1
-}
-
-define ptr @freeze_ptr() {
-; CHECK-LABEL: freeze_ptr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x0, x8, #4
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_ptr:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add x0, x8, #4
-; GISEL-NEXT:    ret
-  %y1 = freeze ptr undef
-  %t1 = getelementptr i8, ptr %y1, i64 4
-  ret ptr %t1
-}
-
-define i32 @freeze_struct() {
-; CHECK-LABEL: freeze_struct:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w0, w8, w8
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_struct:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add w0, w8, w8
-; GISEL-NEXT:    ret
-  %y1 = freeze %struct.T undef
-  %v1 = extractvalue %struct.T %y1, 0
-  %v2 = extractvalue %struct.T %y1, 1
-  %t1 = add i32 %v1, %v2
-  ret i32 %t1
-}
-
-define i32 @freeze_anonstruct() {
-; CHECK-LABEL: freeze_anonstruct:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w0, w8, w8
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_anonstruct:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add w0, w8, w8
-; GISEL-NEXT:    ret
-  %y1 = freeze {i32, i32} undef
-  %v1 = extractvalue {i32, i32} %y1, 0
-  %v2 = extractvalue {i32, i32} %y1, 1
-  %t1 = add i32 %v1, %v2
-  ret i32 %t1
-}
-
-define i32 @freeze_anonstruct2() {
-; CHECK-LABEL: freeze_anonstruct2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w0, w8, w8, uxth
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_anonstruct2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add w0, w8, w8, uxth
-; GISEL-NEXT:    ret
-  %y1 = freeze {i32, i16} undef
-  %v1 = extractvalue {i32, i16} %y1, 0
-  %v2 = extractvalue {i32, i16} %y1, 1
-  %z2 = zext i16 %v2 to i32
-  %t1 = add i32 %v1, %z2
-  ret i32 %t1
-}
-
-define i64 @freeze_array() {
-; CHECK-LABEL: freeze_array:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x0, x8, x8
-; CHECK-NEXT:    ret
-;
-; GISEL-LABEL: freeze_array:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    add x0, x8, x8
-; GISEL-NEXT:    ret
-  %y1 = freeze [2 x i64] undef
-  %v1 = extractvalue [2 x i64] %y1, 0
-  %v2 = extractvalue [2 x i64] %y1, 1
-  %t1 = add i64 %v1, %v2
-  ret i64 %t1
-}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir
index 323a3993473fc..b0b0e6b322a01 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extract-vector-elt.mir
@@ -7,7 +7,9 @@ body: |
   bb.0:
     liveins: $q0
     ; CHECK-LABEL: name: test_eve_1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64)
     ; CHECK-NEXT: $x0 = COPY [[EVEC]](s64)
@@ -24,7 +26,9 @@ body: |
   bb.0:
     liveins: $q0, $q1
     ; CHECK-LABEL: name: test_eve_v2s1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<2 x s64>) = G_ICMP intpred(eq), [[COPY]](<2 x s64>), [[COPY1]]
@@ -46,7 +50,9 @@ body: |
   bb.0:
     liveins: $q0, $q1
     ; CHECK-LABEL: name: test_eve_v4s1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), [[COPY]](<4 x s32>), [[COPY1]]
@@ -69,7 +75,9 @@ body: |
   bb.0:
     liveins: $q0, $q1
     ; CHECK-LABEL: name: test_eve_v8s1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s16>) = G_ICMP intpred(eq), [[COPY]](<8 x s16>), [[COPY1]]
@@ -92,7 +100,9 @@ body: |
   bb.0:
     liveins: $q0, $q1
     ; CHECK-LABEL: name: test_eve_v16s1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s8>) = G_ICMP intpred(eq), [[COPY]](<16 x s8>), [[COPY1]]
@@ -115,7 +125,9 @@ body: |
   bb.0:
     liveins: $q0, $q1
     ; CHECK-LABEL: name: test_eve_v2p0
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(p0) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x p0>), [[C]](s64)
     ; CHECK-NEXT: $x0 = COPY [[EVEC]](p0)
@@ -132,7 +144,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v4s64
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64)
     ; CHECK-NEXT: $x0 = COPY [[EVEC]](s64)
@@ -152,7 +166,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v2s1_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<2 x s64>) = G_ICMP intpred(eq), [[COPY]](<2 x s64>), [[COPY1]]
@@ -181,7 +197,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v4s1_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), [[COPY]](<4 x s32>), [[COPY1]]
@@ -211,7 +229,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v8s1_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s16>) = G_ICMP intpred(eq), [[COPY]](<8 x s16>), [[COPY1]]
@@ -241,7 +261,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v16s1_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s8>) = G_ICMP intpred(eq), [[COPY]](<16 x s8>), [[COPY1]]
@@ -271,7 +293,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v2p0_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[COPY]](<2 x p0>)
@@ -296,7 +320,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v4s64_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s64) = COPY $x0
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
@@ -326,7 +352,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v8s32
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64)
     ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32)
@@ -346,7 +374,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $x0
     ; CHECK-LABEL: name: test_eve_v16s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q1
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q1
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s16>), [[C]](s64)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[EVEC]](s16)
@@ -368,10 +398,11 @@ body: |
   bb.0:
     liveins: $x0
     ; CHECK-LABEL: name: test_eve_v4p0
-    ; CHECK: %vec:_(<4 x p0>) = G_IMPLICIT_DEF
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x p0>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>)
-    ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT [[UV]](<2 x p0>), %idx(s64)
+    ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT [[DEF]](<2 x p0>), %idx(s64)
     ; CHECK-NEXT: $x0 = COPY %eve(p0)
     ; CHECK-NEXT: RET_ReallyLR
     %vec:_(<4 x p0>) = G_IMPLICIT_DEF
@@ -386,7 +417,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $w0
     ; CHECK-LABEL: name: test_eve_v8s32_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s32) = COPY $w0
     ; CHECK-NEXT: %idxprom:_(s64) = G_SEXT %idx(s32)
@@ -418,7 +451,9 @@ body: |
   bb.0:
     liveins: $q0, $q1, $w0
     ; CHECK-LABEL: name: test_eve_v16s16_unknown_idx
-    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK: liveins: $q0, $q1, $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s32) = COPY $w0
     ; CHECK-NEXT: %idxprom:_(s64) = G_SEXT %idx(s32)
@@ -452,15 +487,16 @@ body: |
   bb.0:
     liveins: $x0
     ; CHECK-LABEL: name: test_eve_v4p0_unknown_idx
-    ; CHECK: %vec:_(<4 x p0>) = G_IMPLICIT_DEF
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x p0>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: %idx:_(s64) = COPY $x0
     ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x p0>), [[UV1:%[0-9]+]]:_(<2 x p0>) = G_UNMERGE_VALUES %vec(<4 x p0>)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[UV]](<2 x p0>)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[DEF]](<2 x p0>)
     ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), [[FRAME_INDEX]](p0) :: (store (<2 x s64>) into %stack.0, align 32)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[UV1]](<2 x p0>)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[DEF]](<2 x p0>)
     ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into %stack.0 + 16, basealign 32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND %idx, [[C1]]
@@ -477,15 +513,17 @@ body: |
     RET_ReallyLR
 ...
 ---
+# Make sure that the pointer legalization rules don't apply when we have
+# different address spaces.
 name:            cant_legalize_different_address_spaces
 body: |
   bb.0:
     liveins: $x0
-    ; Make sure that the pointer legalization rules don't apply when we have
-    ; different address spaces.
 
     ; CHECK-LABEL: name: cant_legalize_different_address_spaces
-    ; CHECK: %vec:_(<4 x p1>) = G_IMPLICIT_DEF
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %vec:_(<4 x p1>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: %eve:_(p0) = G_EXTRACT_VECTOR_ELT %vec(<4 x p1>), %idx(s64)
     ; CHECK-NEXT: $x0 = COPY %eve(p0)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 69a69dbd3b18b..0fe2bbe2c449f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -231,29 +231,24 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    ldr d0, [x1]
-; CHECK-NEON-NEXT:    ldrh w9, [x0]
-; CHECK-NEON-NEXT:    ldrh w10, [x0, #2]
-; CHECK-NEON-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEON-NEXT:    fmov x11, d0
-; CHECK-NEON-NEXT:    mov x8, v0.d[1]
-; CHECK-NEON-NEXT:    smull x9, w9, w11
-; CHECK-NEON-NEXT:    smull x8, w10, w8
-; CHECK-NEON-NEXT:    fmov d0, x9
-; CHECK-NEON-NEXT:    mov v0.d[1], x8
+; CHECK-NEON-NEXT:    ldrh w8, [x0]
+; CHECK-NEON-NEXT:    ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    fmov d0, x8
+; CHECK-NEON-NEXT:    mov v0.d[1], x9
+; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-SVE:       // %bb.0:
 ; CHECK-SVE-NEXT:    ldrh w8, [x0]
 ; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
-; CHECK-SVE-NEXT:    ptrue p0.d, vl2
-; CHECK-SVE-NEXT:    ldr d0, [x1]
-; CHECK-SVE-NEXT:    fmov d1, x8
-; CHECK-SVE-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SVE-NEXT:    mov v1.d[1], x9
-; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    fmov d0, x8
+; CHECK-SVE-NEXT:    mov v0.d[1], x9
+; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
@@ -2404,25 +2399,16 @@ define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
 define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEON-LABEL: lsr:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    ushr v0.2d, v0.2d, #32
-; CHECK-NEON-NEXT:    ushr v1.2d, v1.2d, #32
-; CHECK-NEON-NEXT:    fmov x10, d1
-; CHECK-NEON-NEXT:    fmov x11, d0
-; CHECK-NEON-NEXT:    mov x8, v1.d[1]
-; CHECK-NEON-NEXT:    mov x9, v0.d[1]
-; CHECK-NEON-NEXT:    umull x10, w11, w10
-; CHECK-NEON-NEXT:    umull x8, w9, w8
-; CHECK-NEON-NEXT:    fmov d0, x10
-; CHECK-NEON-NEXT:    mov v0.d[1], x8
+; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: lsr:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    ushr v0.2d, v0.2d, #32
-; CHECK-SVE-NEXT:    ushr v1.2d, v1.2d, #32
-; CHECK-SVE-NEXT:    ptrue p0.d, vl2
-; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-SVE-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: lsr:
@@ -2481,25 +2467,16 @@ define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEON-LABEL: asr:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    sshr v0.2d, v0.2d, #32
-; CHECK-NEON-NEXT:    sshr v1.2d, v1.2d, #32
-; CHECK-NEON-NEXT:    fmov x10, d1
-; CHECK-NEON-NEXT:    fmov x11, d0
-; CHECK-NEON-NEXT:    mov x8, v1.d[1]
-; CHECK-NEON-NEXT:    mov x9, v0.d[1]
-; CHECK-NEON-NEXT:    smull x10, w11, w10
-; CHECK-NEON-NEXT:    smull x8, w9, w8
-; CHECK-NEON-NEXT:    fmov d0, x10
-; CHECK-NEON-NEXT:    mov v0.d[1], x8
+; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: asr:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    sshr v0.2d, v0.2d, #32
-; CHECK-SVE-NEXT:    sshr v1.2d, v1.2d, #32
-; CHECK-SVE-NEXT:    ptrue p0.d, vl2
-; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-SVE-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: asr:
@@ -2524,25 +2501,16 @@ define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEON-LABEL: asr_const:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    sshr v0.2d, v0.2d, #32
-; CHECK-NEON-NEXT:    fmov x9, d0
-; CHECK-NEON-NEXT:    mov x8, v0.d[1]
-; CHECK-NEON-NEXT:    lsl x10, x9, #5
-; CHECK-NEON-NEXT:    lsl x11, x8, #5
-; CHECK-NEON-NEXT:    sub x9, x10, x9
-; CHECK-NEON-NEXT:    fmov d0, x9
-; CHECK-NEON-NEXT:    sub x8, x11, x8
-; CHECK-NEON-NEXT:    mov v0.d[1], x8
+; CHECK-NEON-NEXT:    movi v1.2s, #31
+; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: asr_const:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    mov w8, #31 // =0x1f
-; CHECK-SVE-NEXT:    sshr v0.2d, v0.2d, #32
-; CHECK-SVE-NEXT:    ptrue p0.d, vl2
-; CHECK-SVE-NEXT:    dup v1.2d, x8
-; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    movi v1.2s, #31
+; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: asr_const:
diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir
new file mode 100644
index 0000000000000..a24972d138832
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir
@@ -0,0 +1,200 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=cfi-fixup %s -o - | FileCheck %s
+--- |
+  define i32 @f0(i32 %x) #0 {
+  entry: br label %return
+  if.end: br label %return
+  if.then2: br label %return
+  if.else: br label %return
+  return:
+    ret i32 0
+  }
+
+  declare i32 @g(i32)
+
+  attributes #0 = { nounwind shadowcallstack uwtable "sign-return-address"="non-leaf" "target-features"="+reserve-x18" }
+
+...
+---
+name:            f0
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+failsVerification: false
+registers:       []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       16
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  hasRedZone:      false
+body:             |
+  ; CHECK-LABEL: name: f0
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.4(0x30000000), %bb.1(0x50000000)
+  ; CHECK-NEXT:   liveins: $w0, $lr, $x18
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.end:
+  ; CHECK-NEXT:   successors: %bb.3(0x30000000), %bb.2(0x50000000)
+  ; CHECK-NEXT:   liveins: $w0, $lr, $x18
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   early-clobber $x18 = frame-setup STRXpost $lr, $x18, 8
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78
+  ; CHECK-NEXT:   frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION negate_ra_sign_state
+  ; CHECK-NEXT:   early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 :: (store (s64) into %stack.0)
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w30, -16
+  ; CHECK-NEXT:   CFI_INSTRUCTION remember_state
+  ; CHECK-NEXT:   TBNZW renamable $w0, 31, %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.else:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $w0 = nuw nsw ADDWri killed renamable $w0, 1, 0
+  ; CHECK-NEXT:   BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   renamable $w8 = MOVZWi 1, 0
+  ; CHECK-NEXT:   $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0
+  ; CHECK-NEXT:   B %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.then2 (bbsections 1):
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION negate_ra_sign_state
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w30, -16
+  ; CHECK-NEXT:   renamable $w0 = nsw SUBWri killed renamable $w0, 1, 0
+  ; CHECK-NEXT:   BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0
+  ; CHECK-NEXT:   B %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.return:
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.return:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CFI_INSTRUCTION restore_state
+  ; CHECK-NEXT:   CFI_INSTRUCTION remember_state
+  ; CHECK-NEXT:   B %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.return:
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.0)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+  ; CHECK-NEXT:   early-clobber $x18, $lr = frame-destroy LDRXpre $x18, -8
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w18
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.return:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CFI_INSTRUCTION restore_state
+  ; CHECK-NEXT:   B %bb.6
+  bb.0.entry:
+    successors: %bb.4(0x30000000), %bb.1(0x50000000)
+    liveins: $w0, $lr, $x18
+
+    CBZW renamable $w0, %bb.4
+
+  bb.1.if.end:
+    successors: %bb.3(0x30000000), %bb.2(0x50000000)
+    liveins: $w0, $lr, $x18
+
+    early-clobber $x18 = frame-setup STRXpost $lr, $x18, 8
+    frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78
+    frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp
+    frame-setup CFI_INSTRUCTION negate_ra_sign_state
+    early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 :: (store (s64) into %stack.0)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $w30, -16
+    TBNZW renamable $w0, 31, %bb.3
+
+  bb.2.if.else:
+    successors: %bb.5(0x80000000)
+    liveins: $w0
+
+    renamable $w0 = nuw nsw ADDWri killed renamable $w0, 1, 0
+    BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+    renamable $w8 = MOVZWi 1, 0
+    $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0
+    B %bb.5
+
+  bb.3.if.then2 (bbsections 1):
+    successors: %bb.5(0x80000000)
+    liveins: $w0
+
+    renamable $w0 = nsw SUBWri killed renamable $w0, 1, 0
+    BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+    renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0
+    B %bb.5
+
+  bb.4.return:
+    liveins: $w0
+    RET undef $lr, implicit killed $w0
+
+  bb.5.return:
+    liveins: $w0
+    B %bb.6
+
+  bb.7.return:
+    liveins: $w0
+    early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.0)
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+    frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp
+    frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+    early-clobber $x18, $lr = frame-destroy LDRXpre $x18, -8
+    frame-destroy CFI_INSTRUCTION restore $w18
+    frame-destroy CFI_INSTRUCTION restore $w30
+    RET undef $lr, implicit killed $w0
+
+  bb.6.return:
+    liveins: $w0
+    B %bb.7
+
+
+...
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
new file mode 100644
index 0000000000000..6efd9f40f0068
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -0,0 +1,398 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for freeze_v2i8
+
+%struct.T = type { i32, i32 }
+
+define i32 @freeze_int() {
+; CHECK-LABEL: freeze_int:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w0, w8, w8
+; CHECK-NEXT:    ret
+  %y1 = freeze i32 undef
+  %t1 = mul i32 %y1, %y1
+  ret i32 %t1
+}
+
+define i5 @freeze_int2() {
+; CHECK-LABEL: freeze_int2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul w0, w8, w8
+; CHECK-NEXT:    ret
+  %y1 = freeze i5 undef
+  %t1 = mul i5 %y1, %y1
+  ret i5 %t1
+}
+
+define float @freeze_float() {
+; CHECK-LABEL: freeze_float:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd s0, s0, s0
+; CHECK-NEXT:    ret
+  %y1 = freeze float undef
+  %t1 = fadd float %y1, %y1
+  ret float %t1
+}
+
+define <2 x i8> @freeze_v2i8() {
+; CHECK-LABEL: freeze_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %y1 = freeze <2 x i8> undef
+  %t1 = add <2 x i8> %y1, %y1
+  ret <2 x i8> %t1
+}
+
+define <3 x i8> @freeze_v3i8() {
+; CHECK-SD-LABEL: freeze_v3i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT:    umov w0, v0.h[0]
+; CHECK-SD-NEXT:    umov w1, v0.h[1]
+; CHECK-SD-NEXT:    umov w2, v0.h[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v3i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov b0, v0.b[1]
+; CHECK-GI-NEXT:    mov b1, v0.b[2]
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    umov w0, v0.h[0]
+; CHECK-GI-NEXT:    umov w1, v0.h[1]
+; CHECK-GI-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <3 x i8> undef
+  %t1 = add <3 x i8> %y1, %y1
+  ret <3 x i8> %t1
+}
+
+define <4 x i8> @freeze_v4i8() {
+; CHECK-SD-LABEL: freeze_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov b0, v0.b[1]
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov b1, v0.b[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov b2, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <4 x i8> undef
+  %t1 = add <4 x i8> %y1, %y1
+  ret <4 x i8> %t1
+}
+
+define <8 x i8> @freeze_v8i8() {
+; CHECK-LABEL: freeze_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %y1 = freeze <8 x i8> undef
+  %t1 = add <8 x i8> %y1, %y1
+  ret <8 x i8> %t1
+}
+
+define <16 x i8> @freeze_v16i8() {
+; CHECK-LABEL: freeze_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %y1 = freeze <16 x i8> undef
+  %t1 = add <16 x i8> %y1, %y1
+  ret <16 x i8> %t1
+}
+
+define <32 x i8> @freeze_v32i8() {
+; CHECK-LABEL: freeze_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %y1 = freeze <32 x i8> undef
+  %t1 = add <32 x i8> %y1, %y1
+  ret <32 x i8> %t1
+}
+
+define <2 x i16> @freeze_v2i16() {
+; CHECK-SD-LABEL: freeze_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov h0, v0.h[1]
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    add v0.2s, v1.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <2 x i16> undef
+  %t1 = add <2 x i16> %y1, %y1
+  ret <2 x i16> %t1
+}
+
+define <3 x i16> @freeze_v3i16() {
+; CHECK-LABEL: freeze_v3i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %y1 = freeze <3 x i16> undef
+  %t1 = add <3 x i16> %y1, %y1
+  ret <3 x i16> %t1
+}
+
+define <4 x i16> @freeze_v4i16() {
+; CHECK-LABEL: freeze_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %y1 = freeze <4 x i16> undef
+  %t1 = add <4 x i16> %y1, %y1
+  ret <4 x i16> %t1
+}
+
+define <8 x i16> @freeze_v8i16() {
+; CHECK-LABEL: freeze_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    ret
+  %y1 = freeze <8 x i16> undef
+  %t1 = add <8 x i16> %y1, %y1
+  ret <8 x i16> %t1
+}
+
+define <16 x i16> @freeze_v16i16() {
+; CHECK-LABEL: freeze_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %y1 = freeze <16 x i16> undef
+  %t1 = add <16 x i16> %y1, %y1
+  ret <16 x i16> %t1
+}
+
+define <2 x i32> @freeze_v2i32() {
+; CHECK-LABEL: freeze_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %y1 = freeze <2 x i32> undef
+  %t1 = add <2 x i32> %y1, %y1
+  ret <2 x i32> %t1
+}
+
+define <3 x i32> @freeze_v3i32() {
+; CHECK-LABEL: freeze_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %y1 = freeze <3 x i32> undef
+  %t1 = add <3 x i32> %y1, %y1
+  ret <3 x i32> %t1
+}
+
+define <4 x i32> @freeze_v4i32() {
+; CHECK-LABEL: freeze_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %y1 = freeze <4 x i32> undef
+  %t1 = add <4 x i32> %y1, %y1
+  ret <4 x i32> %t1
+}
+
+define <8 x i32> @freeze_v8i32() {
+; CHECK-LABEL: freeze_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %y1 = freeze <8 x i32> undef
+  %t1 = add <8 x i32> %y1, %y1
+  ret <8 x i32> %t1
+}
+
+define <2 x i64> @freeze_v2i64() {
+; CHECK-LABEL: freeze_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %y1 = freeze <2 x i64> undef
+  %t1 = add <2 x i64> %y1, %y1
+  ret <2 x i64> %t1
+}
+
+define <3 x i64> @freeze_v3i64() {
+; CHECK-SD-LABEL: freeze_v3i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT:    fmov d2, d0
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v3i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-GI-NEXT:    add x8, x8, x8
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <3 x i64> undef
+  %t1 = add <3 x i64> %y1, %y1
+  ret <3 x i64> %t1
+}
+
+define <4 x i64> @freeze_v4i64() {
+; CHECK-LABEL: freeze_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %y1 = freeze <4 x i64> undef
+  %t1 = add <4 x i64> %y1, %y1
+  ret <4 x i64> %t1
+}
+
+define <2 x ptr> @freeze_v2p0() {
+; CHECK-SD-LABEL: freeze_v2p0:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #4 // =0x4
+; CHECK-SD-NEXT:    dup v0.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v2p0:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI21_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI21_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <2 x ptr> undef
+  %t1 = getelementptr i32, <2 x ptr> %y1, i32 1
+  ret <2 x ptr> %t1
+}
+
+define <3 x ptr> @freeze_v3p0() {
+; CHECK-SD-LABEL: freeze_v3p0:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #4 // =0x4
+; CHECK-SD-NEXT:    dup v2.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add d2, d0, d2
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v3p0:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT:    add x8, x8, #4
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <3 x ptr> undef
+  %t1 = getelementptr i32, <3 x ptr> %y1, i32 1
+  ret <3 x ptr> %t1
+}
+
+define <4 x ptr> @freeze_v4p0() {
+; CHECK-SD-LABEL: freeze_v4p0:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #4 // =0x4
+; CHECK-SD-NEXT:    dup v0.2d, x8
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_v4p0:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+  %y1 = freeze <4 x ptr> undef
+  %t1 = getelementptr i32, <4 x ptr> %y1, i32 1
+  ret <4 x ptr> %t1
+}
+
+define ptr @freeze_ptr() {
+; CHECK-LABEL: freeze_ptr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x0, x8, #4
+; CHECK-NEXT:    ret
+  %y1 = freeze ptr undef
+  %t1 = getelementptr i8, ptr %y1, i64 4
+  ret ptr %t1
+}
+
+define i32 @freeze_struct() {
+; CHECK-LABEL: freeze_struct:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
+  %y1 = freeze %struct.T undef
+  %v1 = extractvalue %struct.T %y1, 0
+  %v2 = extractvalue %struct.T %y1, 1
+  %t1 = add i32 %v1, %v2
+  ret i32 %t1
+}
+
+define i32 @freeze_anonstruct() {
+; CHECK-LABEL: freeze_anonstruct:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w0, w8, w8
+; CHECK-NEXT:    ret
+  %y1 = freeze {i32, i32} undef
+  %v1 = extractvalue {i32, i32} %y1, 0
+  %v2 = extractvalue {i32, i32} %y1, 1
+  %t1 = add i32 %v1, %v2
+  ret i32 %t1
+}
+
+define i32 @freeze_anonstruct2() {
+; CHECK-LABEL: freeze_anonstruct2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w0, w8, w8, uxth
+; CHECK-NEXT:    ret
+  %y1 = freeze {i32, i16} undef
+  %v1 = extractvalue {i32, i16} %y1, 0
+  %v2 = extractvalue {i32, i16} %y1, 1
+  %z2 = zext i16 %v2 to i32
+  %t1 = add i32 %v1, %z2
+  ret i32 %t1
+}
+
+define i64 @freeze_array() {
+; CHECK-LABEL: freeze_array:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x0, x8, x8
+; CHECK-NEXT:    ret
+  %y1 = freeze [2 x i64] undef
+  %v1 = extractvalue [2 x i64] %y1, 0
+  %v2 = extractvalue [2 x i64] %y1, 1
+  %t1 = add i64 %v1, %v2
+  ret i64 %t1
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index e8dafd5e8fbab..932a5af264a00 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -497,6 +497,35 @@ for.exit:                                 ; preds = %for.body
   ret i64 %spec.select
 }
 
+@a = external local_unnamed_addr global i32, align 4
+
+; FIXME: Load hoisted out of the loop across memory barriers.
+define i32 @load_between_memory_barriers() {
+; CHECK-LABEL: load_between_memory_barriers:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, :got:a
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:  .LBB8_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    //MEMBARRIER
+; CHECK-NEXT:    //MEMBARRIER
+; CHECK-NEXT:    cbz w0, .LBB8_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+  br label %loop
+
+loop:
+  fence syncscope("singlethread") acq_rel
+  %l = load i32, ptr @a, align 4
+  fence syncscope("singlethread") acq_rel
+  %c = icmp eq i32 %l, 0
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret i32 %l
+}
+
 declare i32 @bcmp(ptr, ptr, i64)
 declare i32 @memcmp(ptr, ptr, i64)
 declare void @func()
diff --git a/llvm/test/CodeGen/AArch64/phi.ll b/llvm/test/CodeGen/AArch64/phi.ll
index 402c7eeabb291..eeafbaffbcc69 100644
--- a/llvm/test/CodeGen/AArch64/phi.ll
+++ b/llvm/test/CodeGen/AArch64/phi.ll
@@ -161,14 +161,45 @@ e:
     ret i128 %h
 }
 
+define ptr @tp0(i1 %c, ptr %p, ptr %a, ptr %b) {
+; CHECK-SD-LABEL: tp0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB5_2
+; CHECK-SD-NEXT:  // %bb.1: // %t
+; CHECK-SD-NEXT:    mov x3, x2
+; CHECK-SD-NEXT:    str wzr, [x1]
+; CHECK-SD-NEXT:  .LBB5_2: // %e
+; CHECK-SD-NEXT:    mov x0, x3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tp0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x0, x3
+; CHECK-GI-NEXT:    tbz w8, #0, .LBB5_2
+; CHECK-GI-NEXT:  // %bb.1: // %t
+; CHECK-GI-NEXT:    mov x0, x2
+; CHECK-GI-NEXT:    str wzr, [x1]
+; CHECK-GI-NEXT:  .LBB5_2: // %e
+; CHECK-GI-NEXT:    ret
+entry:
+    br i1 %c, label %t, label %e
+t:
+    store i32 0, ptr %p
+    br label %e
+e:
+    %h = phi ptr [%a, %t], [%b, %entry]
+    ret ptr %h
+}
+
 define half @tf16(i1 %c, ptr %p, half %a, half %b) {
 ; CHECK-LABEL: tf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB5_2
+; CHECK-NEXT:    tbz w0, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov s1, s0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB5_2: // %e
+; CHECK-NEXT:  .LBB6_2: // %e
 ; CHECK-NEXT:    fmov s0, s1
 ; CHECK-NEXT:    ret
 entry:
@@ -184,11 +215,11 @@ e:
 define float @tf32(i1 %c, ptr %p, float %a, float %b) {
 ; CHECK-LABEL: tf32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB6_2
+; CHECK-NEXT:    tbz w0, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov s1, s0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB6_2: // %e
+; CHECK-NEXT:  .LBB7_2: // %e
 ; CHECK-NEXT:    fmov s0, s1
 ; CHECK-NEXT:    ret
 entry:
@@ -204,11 +235,11 @@ e:
 define double @tf64(i1 %c, ptr %p, double %a, double %b) {
 ; CHECK-LABEL: tf64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB7_2
+; CHECK-NEXT:    tbz w0, #0, .LBB8_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB7_2: // %e
+; CHECK-NEXT:  .LBB8_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -224,25 +255,25 @@ e:
 define fp128 @tf128(i1 %c, ptr %p, fp128 %a, fp128 %b) {
 ; CHECK-SD-LABEL: tf128:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB8_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB9_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v1.16b, v0.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB8_2: // %e
+; CHECK-SD-NEXT:  .LBB9_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: tf128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB8_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB9_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov d2, v0.d[1]
 ; CHECK-GI-NEXT:    fmov d1, d0
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:    b .LBB8_3
-; CHECK-GI-NEXT:  .LBB8_2:
+; CHECK-GI-NEXT:    b .LBB9_3
+; CHECK-GI-NEXT:  .LBB9_2:
 ; CHECK-GI-NEXT:    mov d2, v1.d[1]
-; CHECK-GI-NEXT:  .LBB8_3: // %e
+; CHECK-GI-NEXT:  .LBB9_3: // %e
 ; CHECK-GI-NEXT:    fmov x8, d1
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
@@ -261,11 +292,11 @@ e:
 define <2 x i8> @tv2i8(i1 %c, ptr %p, <2 x i8> %a, <2 x i8> %b) {
 ; CHECK-SD-LABEL: tv2i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB9_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB10_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    fmov d1, d0
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB9_2: // %e
+; CHECK-SD-NEXT:  .LBB10_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d1
 ; CHECK-SD-NEXT:    ret
 ;
@@ -273,17 +304,17 @@ define <2 x i8> @tv2i8(i1 %c, ptr %p, <2 x i8> %a, <2 x i8> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB9_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB10_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    str wzr, [x1]
 ; CHECK-GI-NEXT:    mov v0.b[1], w8
 ; CHECK-GI-NEXT:    fmov d1, d0
-; CHECK-GI-NEXT:    b .LBB9_3
-; CHECK-GI-NEXT:  .LBB9_2:
+; CHECK-GI-NEXT:    b .LBB10_3
+; CHECK-GI-NEXT:  .LBB10_2:
 ; CHECK-GI-NEXT:    mov w8, v1.s[1]
 ; CHECK-GI-NEXT:    mov v1.b[1], w8
-; CHECK-GI-NEXT:  .LBB9_3: // %e
+; CHECK-GI-NEXT:  .LBB10_3: // %e
 ; CHECK-GI-NEXT:    umov w8, v1.b[0]
 ; CHECK-GI-NEXT:    umov w9, v1.b[1]
 ; CHECK-GI-NEXT:    mov v0.s[0], w8
@@ -303,13 +334,13 @@ e:
 define <3 x i8> @tv3i8(i1 %c, ptr %p, <3 x i8> %a, <3 x i8> %b) {
 ; CHECK-SD-LABEL: tv3i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB10_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB11_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov w5, w2
 ; CHECK-SD-NEXT:    mov w6, w3
 ; CHECK-SD-NEXT:    mov w7, w4
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB10_2: // %e
+; CHECK-SD-NEXT:  .LBB11_2: // %e
 ; CHECK-SD-NEXT:    mov w0, w5
 ; CHECK-SD-NEXT:    mov w1, w6
 ; CHECK-SD-NEXT:    mov w2, w7
@@ -317,18 +348,18 @@ define <3 x i8> @tv3i8(i1 %c, ptr %p, <3 x i8> %a, <3 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: tv3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB10_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB11_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    fmov s0, w2
 ; CHECK-GI-NEXT:    str wzr, [x1]
 ; CHECK-GI-NEXT:    mov v0.b[1], w3
 ; CHECK-GI-NEXT:    mov v0.b[2], w4
-; CHECK-GI-NEXT:    b .LBB10_3
-; CHECK-GI-NEXT:  .LBB10_2:
+; CHECK-GI-NEXT:    b .LBB11_3
+; CHECK-GI-NEXT:  .LBB11_2:
 ; CHECK-GI-NEXT:    fmov s0, w5
 ; CHECK-GI-NEXT:    mov v0.b[1], w6
 ; CHECK-GI-NEXT:    mov v0.b[2], w7
-; CHECK-GI-NEXT:  .LBB10_3: // %e
+; CHECK-GI-NEXT:  .LBB11_3: // %e
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
 ; CHECK-GI-NEXT:    umov w2, v0.b[2]
@@ -346,24 +377,24 @@ e:
 define <4 x i8> @tv4i8(i1 %c, ptr %p, <4 x i8> %a, <4 x i8> %b) {
 ; CHECK-SD-LABEL: tv4i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB11_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB12_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    fmov d1, d0
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB11_2: // %e
+; CHECK-SD-NEXT:  .LBB12_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: tv4i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB11_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB12_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-GI-NEXT:    str wzr, [x1]
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
-; CHECK-GI-NEXT:  .LBB11_2:
+; CHECK-GI-NEXT:  .LBB12_2:
 ; CHECK-GI-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -381,11 +412,11 @@ e:
 define <8 x i8> @tv8i8(i1 %c, ptr %p, <8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: tv8i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB12_2
+; CHECK-NEXT:    tbz w0, #0, .LBB13_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB12_2: // %e
+; CHECK-NEXT:  .LBB13_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -401,11 +432,11 @@ e:
 define <16 x i8> @tv16i8(i1 %c, ptr %p, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: tv16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB13_2
+; CHECK-NEXT:    tbz w0, #0, .LBB14_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB13_2: // %e
+; CHECK-NEXT:  .LBB14_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -421,12 +452,12 @@ e:
 define <32 x i8> @tv32i8(i1 %c, ptr %p, <32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SD-LABEL: tv32i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB14_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB15_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB14_2: // %e
+; CHECK-SD-NEXT:  .LBB15_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -435,12 +466,12 @@ define <32 x i8> @tv32i8(i1 %c, ptr %p, <32 x i8> %a, <32 x i8> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB14_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB15_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB14_2: // %e
+; CHECK-GI-NEXT:  .LBB15_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -456,24 +487,24 @@ e:
 define <2 x i16> @tv2i16(i1 %c, ptr %p, <2 x i16> %a, <2 x i16> %b) {
 ; CHECK-SD-LABEL: tv2i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB15_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB16_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    fmov d1, d0
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB15_2: // %e
+; CHECK-SD-NEXT:  .LBB16_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: tv2i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB15_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB16_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-GI-NEXT:    str wzr, [x1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
-; CHECK-GI-NEXT:  .LBB15_2:
+; CHECK-GI-NEXT:  .LBB16_2:
 ; CHECK-GI-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -491,11 +522,11 @@ e:
 define <3 x i16> @tv3i16(i1 %c, ptr %p, <3 x i16> %a, <3 x i16> %b) {
 ; CHECK-LABEL: tv3i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB16_2
+; CHECK-NEXT:    tbz w0, #0, .LBB17_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB16_2: // %e
+; CHECK-NEXT:  .LBB17_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -511,11 +542,11 @@ e:
 define <4 x i16> @tv4i16(i1 %c, ptr %p, <4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: tv4i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB17_2
+; CHECK-NEXT:    tbz w0, #0, .LBB18_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB17_2: // %e
+; CHECK-NEXT:  .LBB18_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -531,11 +562,11 @@ e:
 define <8 x i16> @tv8i16(i1 %c, ptr %p, <8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: tv8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB18_2
+; CHECK-NEXT:    tbz w0, #0, .LBB19_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB18_2: // %e
+; CHECK-NEXT:  .LBB19_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -551,12 +582,12 @@ e:
 define <16 x i16> @tv16i16(i1 %c, ptr %p, <16 x i16> %a, <16 x i16> %b) {
 ; CHECK-SD-LABEL: tv16i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB19_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB20_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB19_2: // %e
+; CHECK-SD-NEXT:  .LBB20_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -565,12 +596,12 @@ define <16 x i16> @tv16i16(i1 %c, ptr %p, <16 x i16> %a, <16 x i16> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB19_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB20_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB19_2: // %e
+; CHECK-GI-NEXT:  .LBB20_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -586,11 +617,11 @@ e:
 define <2 x i32> @tv2i32(i1 %c, ptr %p, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: tv2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB20_2
+; CHECK-NEXT:    tbz w0, #0, .LBB21_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB20_2: // %e
+; CHECK-NEXT:  .LBB21_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -606,11 +637,11 @@ e:
 define <3 x i32> @tv3i32(i1 %c, ptr %p, <3 x i32> %a, <3 x i32> %b) {
 ; CHECK-LABEL: tv3i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB21_2
+; CHECK-NEXT:    tbz w0, #0, .LBB22_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB21_2: // %e
+; CHECK-NEXT:  .LBB22_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -626,11 +657,11 @@ e:
 define <4 x i32> @tv4i32(i1 %c, ptr %p, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: tv4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB22_2
+; CHECK-NEXT:    tbz w0, #0, .LBB23_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB22_2: // %e
+; CHECK-NEXT:  .LBB23_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -646,12 +677,12 @@ e:
 define <8 x i32> @tv8i32(i1 %c, ptr %p, <8 x i32> %a, <8 x i32> %b) {
 ; CHECK-SD-LABEL: tv8i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB23_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB24_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB23_2: // %e
+; CHECK-SD-NEXT:  .LBB24_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -660,12 +691,12 @@ define <8 x i32> @tv8i32(i1 %c, ptr %p, <8 x i32> %a, <8 x i32> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB23_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB24_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB23_2: // %e
+; CHECK-GI-NEXT:  .LBB24_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -681,11 +712,11 @@ e:
 define <2 x i64> @tv2i64(i1 %c, ptr %p, <2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: tv2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB24_2
+; CHECK-NEXT:    tbz w0, #0, .LBB25_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB24_2: // %e
+; CHECK-NEXT:  .LBB25_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -701,13 +732,13 @@ e:
 define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) {
 ; CHECK-SD-LABEL: tv3i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB25_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB26_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    fmov d3, d0
 ; CHECK-SD-NEXT:    fmov d4, d1
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:    fmov d5, d2
-; CHECK-SD-NEXT:  .LBB25_2: // %e
+; CHECK-SD-NEXT:  .LBB26_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    fmov d1, d4
 ; CHECK-SD-NEXT:    fmov d2, d5
@@ -717,7 +748,7 @@ define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB25_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB26_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    fmov d6, d0
 ; CHECK-GI-NEXT:    fmov d7, d2
@@ -729,7 +760,7 @@ define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) {
 ; CHECK-GI-NEXT:    mov v0.16b, v6.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
-; CHECK-GI-NEXT:  .LBB25_2:
+; CHECK-GI-NEXT:  .LBB26_2:
 ; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    fmov d2, d5
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -750,12 +781,12 @@ e:
 define <4 x i64> @tv4i64(i1 %c, ptr %p, <4 x i64> %a, <4 x i64> %b) {
 ; CHECK-SD-LABEL: tv4i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB26_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB27_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB26_2: // %e
+; CHECK-SD-NEXT:  .LBB27_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -764,12 +795,12 @@ define <4 x i64> @tv4i64(i1 %c, ptr %p, <4 x i64> %a, <4 x i64> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB26_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB27_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB26_2: // %e
+; CHECK-GI-NEXT:  .LBB27_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -785,15 +816,15 @@ e:
 define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) {
 ; CHECK-SD-LABEL: tv2i128:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB27_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB28_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov x6, x2
 ; CHECK-SD-NEXT:    mov x7, x3
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:    b .LBB27_3
-; CHECK-SD-NEXT:  .LBB27_2:
+; CHECK-SD-NEXT:    b .LBB28_3
+; CHECK-SD-NEXT:  .LBB28_2:
 ; CHECK-SD-NEXT:    ldp x4, x5, [sp]
-; CHECK-SD-NEXT:  .LBB27_3: // %e
+; CHECK-SD-NEXT:  .LBB28_3: // %e
 ; CHECK-SD-NEXT:    mov x0, x6
 ; CHECK-SD-NEXT:    mov x1, x7
 ; CHECK-SD-NEXT:    mov x2, x4
@@ -802,7 +833,7 @@ define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) {
 ;
 ; CHECK-GI-LABEL: tv2i128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB27_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB28_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov x9, x2
 ; CHECK-GI-NEXT:    mov x10, x3
@@ -812,7 +843,7 @@ define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) {
 ; CHECK-GI-NEXT:    mov x0, x9
 ; CHECK-GI-NEXT:    mov x1, x10
 ; CHECK-GI-NEXT:    ret
-; CHECK-GI-NEXT:  .LBB27_2:
+; CHECK-GI-NEXT:  .LBB28_2:
 ; CHECK-GI-NEXT:    ldp x2, x3, [sp]
 ; CHECK-GI-NEXT:    mov x0, x6
 ; CHECK-GI-NEXT:    mov x1, x7
@@ -827,14 +858,117 @@ e:
     ret <2 x i128> %h
 }
 
+define <2 x ptr> @tv2p0(i1 %c, ptr %p, <2 x ptr> %a, <2 x ptr> %b) {
+; CHECK-LABEL: tv2p0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tbz w0, #0, .LBB29_2
+; CHECK-NEXT:  // %bb.1: // %t
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    str wzr, [x1]
+; CHECK-NEXT:  .LBB29_2: // %e
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+entry:
+    br i1 %c, label %t, label %e
+t:
+    store i32 0, ptr %p
+    br label %e
+e:
+    %h = phi <2 x ptr> [%a, %t], [%b, %entry]
+    ret <2 x ptr> %h
+}
+
+define <3 x ptr> @tv3p0(i1 %c, ptr %p, <3 x ptr> %a, <3 x ptr> %b) {
+; CHECK-SD-LABEL: tv3p0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB30_2
+; CHECK-SD-NEXT:  // %bb.1: // %t
+; CHECK-SD-NEXT:    fmov d3, d0
+; CHECK-SD-NEXT:    fmov d4, d1
+; CHECK-SD-NEXT:    str wzr, [x1]
+; CHECK-SD-NEXT:    fmov d5, d2
+; CHECK-SD-NEXT:  .LBB30_2: // %e
+; CHECK-SD-NEXT:    fmov d0, d3
+; CHECK-SD-NEXT:    fmov d1, d4
+; CHECK-SD-NEXT:    fmov d2, d5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tv3p0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB30_2
+; CHECK-GI-NEXT:  // %bb.1: // %t
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    str wzr, [x1]
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    b .LBB30_3
+; CHECK-GI-NEXT:  .LBB30_2:
+; CHECK-GI-NEXT:    fmov x8, d3
+; CHECK-GI-NEXT:    fmov x9, d5
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:  .LBB30_3: // %e
+; CHECK-GI-NEXT:    mov v1.d[0], x9
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    fmov d1, d2
+; CHECK-GI-NEXT:    fmov d2, x10
+; CHECK-GI-NEXT:    ret
+entry:
+    br i1 %c, label %t, label %e
+t:
+    store i32 0, ptr %p
+    br label %e
+e:
+    %h = phi <3 x ptr> [%a, %t], [%b, %entry]
+    ret <3 x ptr> %h
+}
+
+define <4 x ptr> @tv4p0(i1 %c, ptr %p, <4 x ptr> %a, <4 x ptr> %b) {
+; CHECK-SD-LABEL: tv4p0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB31_2
+; CHECK-SD-NEXT:  // %bb.1: // %t
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
+; CHECK-SD-NEXT:    str wzr, [x1]
+; CHECK-SD-NEXT:  .LBB31_2: // %e
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tv4p0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v4.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB31_2
+; CHECK-GI-NEXT:  // %bb.1: // %t
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v3.16b, v1.16b
+; CHECK-GI-NEXT:    str wzr, [x1]
+; CHECK-GI-NEXT:  .LBB31_2: // %e
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
+entry:
+    br i1 %c, label %t, label %e
+t:
+    store i32 0, ptr %p
+    br label %e
+e:
+    %h = phi <4 x ptr> [%a, %t], [%b, %entry]
+    ret <4 x ptr> %h
+}
+
 define <2 x half> @tv2f16(i1 %c, ptr %p, <2 x half> %a, <2 x half> %b) {
 ; CHECK-LABEL: tv2f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB28_2
+; CHECK-NEXT:    tbz w0, #0, .LBB32_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB28_2: // %e
+; CHECK-NEXT:  .LBB32_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -850,11 +984,11 @@ e:
 define <3 x half> @tv3f16(i1 %c, ptr %p, <3 x half> %a, <3 x half> %b) {
 ; CHECK-LABEL: tv3f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB29_2
+; CHECK-NEXT:    tbz w0, #0, .LBB33_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB29_2: // %e
+; CHECK-NEXT:  .LBB33_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -870,11 +1004,11 @@ e:
 define <4 x half> @tv4f16(i1 %c, ptr %p, <4 x half> %a, <4 x half> %b) {
 ; CHECK-LABEL: tv4f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB30_2
+; CHECK-NEXT:    tbz w0, #0, .LBB34_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB30_2: // %e
+; CHECK-NEXT:  .LBB34_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -890,11 +1024,11 @@ e:
 define <8 x half> @tv8f16(i1 %c, ptr %p, <8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: tv8f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB31_2
+; CHECK-NEXT:    tbz w0, #0, .LBB35_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB31_2: // %e
+; CHECK-NEXT:  .LBB35_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -910,12 +1044,12 @@ e:
 define <16 x half> @tv16f16(i1 %c, ptr %p, <16 x half> %a, <16 x half> %b) {
 ; CHECK-SD-LABEL: tv16f16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB32_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB36_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB32_2: // %e
+; CHECK-SD-NEXT:  .LBB36_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -924,12 +1058,12 @@ define <16 x half> @tv16f16(i1 %c, ptr %p, <16 x half> %a, <16 x half> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB32_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB36_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB32_2: // %e
+; CHECK-GI-NEXT:  .LBB36_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -945,11 +1079,11 @@ e:
 define <2 x float> @tv2f32(i1 %c, ptr %p, <2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: tv2f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB33_2
+; CHECK-NEXT:    tbz w0, #0, .LBB37_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB33_2: // %e
+; CHECK-NEXT:  .LBB37_2: // %e
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 entry:
@@ -965,11 +1099,11 @@ e:
 define <3 x float> @tv3f32(i1 %c, ptr %p, <3 x float> %a, <3 x float> %b) {
 ; CHECK-LABEL: tv3f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB34_2
+; CHECK-NEXT:    tbz w0, #0, .LBB38_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB34_2: // %e
+; CHECK-NEXT:  .LBB38_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -985,11 +1119,11 @@ e:
 define <4 x float> @tv4f32(i1 %c, ptr %p, <4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: tv4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB35_2
+; CHECK-NEXT:    tbz w0, #0, .LBB39_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB35_2: // %e
+; CHECK-NEXT:  .LBB39_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -1005,12 +1139,12 @@ e:
 define <8 x float> @tv8f32(i1 %c, ptr %p, <8 x float> %a, <8 x float> %b) {
 ; CHECK-SD-LABEL: tv8f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB36_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB40_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB36_2: // %e
+; CHECK-SD-NEXT:  .LBB40_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -1019,12 +1153,12 @@ define <8 x float> @tv8f32(i1 %c, ptr %p, <8 x float> %a, <8 x float> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB36_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB40_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB36_2: // %e
+; CHECK-GI-NEXT:  .LBB40_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1040,11 +1174,11 @@ e:
 define <2 x double> @tv2f64(i1 %c, ptr %p, <2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: tv2f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w0, #0, .LBB37_2
+; CHECK-NEXT:    tbz w0, #0, .LBB41_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:  .LBB37_2: // %e
+; CHECK-NEXT:  .LBB41_2: // %e
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -1060,13 +1194,13 @@ e:
 define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) {
 ; CHECK-SD-LABEL: tv3f64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB38_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB42_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    fmov d3, d0
 ; CHECK-SD-NEXT:    fmov d4, d1
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:    fmov d5, d2
-; CHECK-SD-NEXT:  .LBB38_2: // %e
+; CHECK-SD-NEXT:  .LBB42_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    fmov d1, d4
 ; CHECK-SD-NEXT:    fmov d2, d5
@@ -1076,7 +1210,7 @@ define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB38_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB42_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    fmov d6, d0
 ; CHECK-GI-NEXT:    fmov d7, d2
@@ -1088,7 +1222,7 @@ define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) {
 ; CHECK-GI-NEXT:    mov v0.16b, v6.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
-; CHECK-GI-NEXT:  .LBB38_2:
+; CHECK-GI-NEXT:  .LBB42_2:
 ; CHECK-GI-NEXT:    fmov d0, d3
 ; CHECK-GI-NEXT:    fmov d2, d5
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -1109,12 +1243,12 @@ e:
 define <4 x double> @tv4f64(i1 %c, ptr %p, <4 x double> %a, <4 x double> %b) {
 ; CHECK-SD-LABEL: tv4f64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB39_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB43_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB39_2: // %e
+; CHECK-SD-NEXT:  .LBB43_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
@@ -1123,12 +1257,12 @@ define <4 x double> @tv4f64(i1 %c, ptr %p, <4 x double> %a, <4 x double> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB39_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB43_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov v0.16b, v4.16b
 ; CHECK-GI-NEXT:    mov v3.16b, v1.16b
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:  .LBB39_2: // %e
+; CHECK-GI-NEXT:  .LBB43_2: // %e
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1144,30 +1278,30 @@ e:
 define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) {
 ; CHECK-SD-LABEL: tv2f128:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    tbz w0, #0, .LBB40_2
+; CHECK-SD-NEXT:    tbz w0, #0, .LBB44_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
 ; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:  .LBB40_2: // %e
+; CHECK-SD-NEXT:  .LBB44_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: tv2f128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    tbz w0, #0, .LBB40_2
+; CHECK-GI-NEXT:    tbz w0, #0, .LBB44_2
 ; CHECK-GI-NEXT:  // %bb.1: // %t
 ; CHECK-GI-NEXT:    mov d4, v1.d[1]
 ; CHECK-GI-NEXT:    mov d5, v0.d[1]
 ; CHECK-GI-NEXT:    str wzr, [x1]
 ; CHECK-GI-NEXT:    fmov d2, d0
 ; CHECK-GI-NEXT:    fmov d3, d1
-; CHECK-GI-NEXT:    b .LBB40_3
-; CHECK-GI-NEXT:  .LBB40_2:
+; CHECK-GI-NEXT:    b .LBB44_3
+; CHECK-GI-NEXT:  .LBB44_2:
 ; CHECK-GI-NEXT:    mov d4, v3.d[1]
 ; CHECK-GI-NEXT:    mov d5, v2.d[1]
-; CHECK-GI-NEXT:  .LBB40_3: // %e
+; CHECK-GI-NEXT:  .LBB44_3: // %e
 ; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    fmov x9, d3
 ; CHECK-GI-NEXT:    mov v0.d[0], x8
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 1fc4de1c48b7d..f9f1cd4b1fcf7 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ;
 ; GISEL-LABEL: popcount256:
 ; GISEL:       // %bb.0: // %Entry
-; GISEL-NEXT:    ldp x8, x9, [x0, #16]
+; GISEL-NEXT:    ldp x8, x9, [x0]
 ; GISEL-NEXT:    mov v0.d[0], x8
-; GISEL-NEXT:    ldp x8, x10, [x0]
+; GISEL-NEXT:    ldp x8, x10, [x0, #16]
 ; GISEL-NEXT:    mov v1.d[0], x8
 ; GISEL-NEXT:    mov v0.d[1], x9
 ; GISEL-NEXT:    mov v1.d[1], x10
diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
new file mode 100644
index 0000000000000..b61fa4be04007
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
@@ -0,0 +1,38 @@
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -o - -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking %s | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
+# REQUIRES: asserts
+
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: test
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %0 [16r,32r:0) 0@16r  weight:0.000000e+00
+# CHECK-DBG: %3 [48r,112r:0) 0@48r  L0000000000000040 [48r,112r:0) 0@48r  weight:0.000000e+00
+# CHECK-DBG: %4 [80r,112e:1)[112e,112d:0) 0@112e 1@80r  L0000000000000080 [112e,112d:0) 0@112e  L0000000000000040 [80r,112e:1)[112e,112d:0) 0@112e 1@80r  weight:0.000000e+00
+# CHECK-DBG: %5 [32r,112r:1)[112r,112d:0) 0@112r 1@32r  weight:0.000000e+00
+---
+name:            test
+tracksRegLiveness: true
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: default, offset: 0, size: 65, alignment: 16,
+      stack-id: default }
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test
+    ; CHECK: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+    ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64common = nuw ADDXri [[ADDXri]], 64, 0
+    ; CHECK-NEXT: undef [[MOVi32imm:%[0-9]+]].sub_32:gpr64 = MOVi32imm 64
+    ; CHECK-NEXT: undef [[MOVi32imm1:%[0-9]+]].sub_32:gpr64 = MOVi32imm 64
+    ; CHECK-NEXT: dead [[ADDXri1]]:gpr64common, dead early-clobber [[MOVi32imm1]]:gpr64 = MOPSMemorySetPseudo [[ADDXri1]], [[MOVi32imm1]], [[MOVi32imm]], implicit-def dead $nzcv
+    ; CHECK-NEXT: RET_ReallyLR
+    %1:gpr64sp = ADDXri %stack.0, 0, 0
+    %2:gpr64common = nuw ADDXri killed %1, 64, 0
+    %3:gpr32 = MOVi32imm 64
+    %4:gpr64 = SUBREG_TO_REG 0, killed %3, %subreg.sub_32
+    %6:gpr64 = COPY %4
+    %5:gpr64common = COPY killed %2
+    dead %5:gpr64common, dead early-clobber %6:gpr64 = MOPSMemorySetPseudo %5, %6, %4, implicit-def dead $nzcv
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index d156ec079ae94..cceeb9f3e830a 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -230,29 +230,34 @@ l2:
   ret <4 x i32> %c
 }
 
-define <4 x float> @fmul(<4 x float> %x, ptr %y) {
+define <4 x float> @fmul(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v1.s[3]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    subs w8, w8, #1
+; CHECK-NEXT:    ldr q2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    cmp w8, #16
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.s[0]
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    b.eq .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
 entry:
-  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <4 x float> %x.ins, <4 x float> undef, <4 x i32> zeroinitializer
   br label %l1
 
 l1:
   %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
   %q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
-  %l = load <4 x float>, ptr %y
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <4 x float>, ptr %ptr.y
   %b = fmul <4 x float> %l, %a
   %c = fadd <4 x float> %b, %q
   %pa = add i32 %p, 1
@@ -270,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB8_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmla v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fmla v0.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    subs w8, w8, #1
 ; CHECK-NEXT:    b.eq .LBB8_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -418,6 +422,134 @@ l2:
   ret <4 x i32> %r
 }
 
+; We shouldn't sink without fullfp16.
+define <4 x half> @fmul_half(ptr %x, ptr %y) {
+; CHECK-LABEL: fmul_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ld1r { v1.4h }, [x0]
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:  .LBB13_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d2, [x1, x8]
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    cmp w8, #8
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    b.eq .LBB13_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load half, ptr %x
+  %x.ins = insertelement <4 x half> poison, half %x.val, i64 0
+  %a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr half, ptr %y, i32 %idx.y
+  %l = load <4 x half>, ptr %ptr.y
+  %b = fmul <4 x half> %l, %a
+  %c = fadd <4 x half> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <4 x half> %c
+}
+
+define <4 x half> @fmul_half_fullfp16(ptr %x, ptr %y) "target-features"="+fullfp16" {
+; CHECK-LABEL: fmul_half_fullfp16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ldr h1, [x0]
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB14_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d2, [x1, x8]
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    cmp w8, #8
+; CHECK-NEXT:    fmul v2.4h, v2.4h, v1.h[0]
+; CHECK-NEXT:    fadd v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    b.eq .LBB14_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load half, ptr %x
+  %x.ins = insertelement <4 x half> poison, half %x.val, i64 0
+  %a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, 4
+  %ptr.y = getelementptr half, ptr %y, i32 %idx.y
+  %l = load <4 x half>, ptr %ptr.y
+  %b = fmul <4 x half> %l, %a
+  %c = fadd <4 x half> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <4 x half> %c
+}
+
+; We shouldn't sink the splat operand for scalable vectors.
+define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) "target-features"="+sve" {
+; CHECK-LABEL: fmul_scalable:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    sxtw x8, w8
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    ld1rw { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:  .LBB15_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    subs w9, w9, #1
+; CHECK-NEXT:    add x1, x1, x8
+; CHECK-NEXT:    fmul z2.s, z2.s, z1.s
+; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
+; CHECK-NEXT:    b.eq .LBB15_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <vscale x 4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <vscale x 4 x float> %x.ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  %33 = tail call i32 @llvm.vscale.i32()
+  %34 = shl nuw nsw i32 %33, 4
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, %34
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <vscale x 4 x float>, ptr %ptr.y
+  %b = fmul <vscale x 4 x float> %l, %a
+  %c = fadd <vscale x 4 x float> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <vscale x 4 x float> %c
+}
+
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll
new file mode 100644
index 0000000000000..00e000f642377
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fexpa.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -force-streaming -mattr=+sme2p2 < %s | FileCheck %s
+
+; FEXPA: each element size of the unpredicated intrinsic must select a single
+; fexpa instruction under both run lines above.
+
+define <vscale x 8 x half> @fexpa_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: fexpa_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fexpa z0.h, z0.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fexpa.x.nxv8f16(<vscale x 8 x i16> %a)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @fexpa_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: fexpa_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fexpa z0.s, z0.s
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fexpa.x.nxv4f32(<vscale x 4 x i32> %a)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @fexpa_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: fexpa_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fexpa z0.d, z0.d
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fexpa.x.nxv2f64(<vscale x 2 x i64> %a)
+  ret <vscale x 2 x double> %out
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll
index a16f230cf8bbb..0aeab72096caa 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith.ll
@@ -299,37 +299,6 @@ define <vscale x 2 x double> @fdivr_d(<vscale x 2 x i1> %pg, <vscale x 2 x doubl
   ret <vscale x 2 x double> %out
 }
 
-;
-; FEXPA
-;
-
-define <vscale x 8 x half> @fexpa_h(<vscale x 8 x i16> %a) {
-; CHECK-LABEL: fexpa_h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fexpa z0.h, z0.h
-; CHECK-NEXT:    ret
-  %out = call <vscale x 8 x half> @llvm.aarch64.sve.fexpa.x.nxv8f16(<vscale x 8 x i16> %a)
-  ret <vscale x 8 x half> %out
-}
-
-define <vscale x 4 x float> @fexpa_s(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: fexpa_s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fexpa z0.s, z0.s
-; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.fexpa.x.nxv4f32(<vscale x 4 x i32> %a)
-  ret <vscale x 4 x float> %out
-}
-
-define <vscale x 2 x double> @fexpa_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
-; CHECK-LABEL: fexpa_d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fexpa z0.d, z0.d
-; CHECK-NEXT:    ret
-  %out = call <vscale x 2 x double> @llvm.aarch64.sve.fexpa.x.nxv2f64(<vscale x 2 x i64> %a)
-  ret <vscale x 2 x double> %out
-}
-
 ;
 ; FMAD
 ;
@@ -1668,10 +1637,6 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fdivr.nxv8f16(<vscale x 8 x i1>, <
 declare <vscale x 4 x float> @llvm.aarch64.sve.fdivr.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.fdivr.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
-declare <vscale x 8 x half> @llvm.aarch64.sve.fexpa.x.nxv8f16(<vscale x 8 x i16>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.fexpa.x.nxv4f32(<vscale x 4 x i32>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.fexpa.x.nxv2f64(<vscale x 2 x i64>)
-
 declare <vscale x 8 x half> @llvm.aarch64.sve.fmad.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.fmad.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.fmad.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 9055b2efba328..48a642c908bfe 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -2677,7 +2677,7 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -2750,7 +2750,7 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
index 5bea13af1649a..37fc0e0282690 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -31,42 +31,39 @@ define void @main(<19 x i32> %arg) {
 ; GFX10-LABEL: main:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s10
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s11
-; GFX10-NEXT:    s_mov_b32 s5, s4
 ; GFX10-NEXT:    s_mov_b32 s6, s4
 ; GFX10-NEXT:    s_mov_b32 s7, s4
 ; GFX10-NEXT:    s_mov_b32 s8, s4
 ; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT:    s_mov_b32 s10, s4
+; GFX10-NEXT:    s_mov_b32 s11, s4
+; GFX10-NEXT:    image_store v[0:3], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: main:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_mov_b32_e32 v5, s7
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT:    s_mov_b32 s6, s0
+; GFX11-NEXT:    s_mov_b32 s7, s0
+; GFX11-NEXT:    image_store v[0:3], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = bitcast <19 x i32> %arg to <38 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index c9426106af5da..d94bf3af3e2f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -55,13 +55,11 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_writelane_b32 v16, s5, 1
 ; CHECK-NEXT:    v_writelane_b32 v16, s6, 2
 ; CHECK-NEXT:    v_writelane_b32 v16, s7, 3
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s4, s6
-; CHECK-NEXT:    s_mov_b32 s5, s6
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b32 s4, exec_lo
 ; CHECK-NEXT:    v_writelane_b32 v16, s4, 4
 ; CHECK-NEXT:    s_or_saveexec_b32 s21, -1
@@ -154,10 +152,10 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    v_readlane_b32 s17, v16, 1
 ; CHECK-NEXT:    v_readlane_b32 s18, v16, 2
 ; CHECK-NEXT:    v_readlane_b32 s19, v16, 3
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; CHECK-NEXT:    image_sample v0, [v0, v1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 36ba7c2ecfac3..3b16c77548a23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -1074,8 +1074,7 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1163,8 +1162,7 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1327,8 +1325,7 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1416,8 +1413,7 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1507,8 +1503,7 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1754,8 +1749,7 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
@@ -1851,8 +1845,7 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (volatile dereferenceable load store (s32), addrspace 8)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index 8e4e4cf2c5b87..ea40703bf98d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -262,8 +262,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -383,8 +382,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -604,8 +602,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -725,8 +722,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -850,8 +846,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1074,8 +1069,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1199,8 +1193,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1327,8 +1320,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1451,8 +1443,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1576,8 +1567,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10NSA-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1895,8 +1885,7 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_3d
@@ -2016,8 +2005,7 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_cube
@@ -2237,8 +2225,7 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_2darray
@@ -2358,8 +2345,7 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_2dmsaa
@@ -2483,8 +2469,7 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_2darraymsaa
@@ -2707,8 +2692,7 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_mip_2d
@@ -2832,8 +2816,7 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_mip_3d
@@ -2960,8 +2943,7 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_mip_cube
@@ -3084,8 +3066,7 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_mip_1darray
@@ -3209,8 +3190,7 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 1 :: (dereferenceable store (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: store_mip_2darray
@@ -5165,8 +5145,7 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF1]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
@@ -5299,8 +5278,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX10NSA-NEXT:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10NSA-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-  ; GFX10NSA-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10NSA-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10NSA-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA-NEXT:   G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GFX10NSA-NEXT:   $vgpr0 = COPY [[UV]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
index 5b017ad89a0ed..659a8a2ff254b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -322,8 +322,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -358,8 +357,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -470,8 +468,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -506,8 +503,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -747,8 +743,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -783,8 +778,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -889,8 +883,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -922,8 +915,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1027,8 +1019,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1061,8 +1052,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1300,8 +1290,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1336,8 +1325,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1444,8 +1432,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1478,8 +1465,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1738,8 +1724,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1772,8 +1757,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1882,8 +1866,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -1918,8 +1901,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -2330,8 +2312,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -2366,8 +2347,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6020,8 +6000,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6056,8 +6035,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6164,8 +6142,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6198,8 +6175,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6710,8 +6686,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6743,8 +6718,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6848,8 +6822,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX10-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX10-NEXT:   $vgpr1 = COPY [[UV1]](s32)
@@ -6882,8 +6855,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX11-NEXT:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; GFX11-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX11-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-  ; GFX11-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
+  ; GFX11-NEXT:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 1 :: (dereferenceable load (<4 x s32>), addrspace 8)
   ; GFX11-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX11-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; GFX11-NEXT:   $vgpr1 = COPY [[UV1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index 12234088adca6..4c15ad0355781 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -87,8 +87,7 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX10-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (s16), addrspace 8)
+  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (s16), addrspace 8)
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: image_store_f16
@@ -198,8 +197,7 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX10-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<2 x s16>), addrspace 8)
+  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<2 x s16>), addrspace 8)
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: image_store_v2f16
@@ -330,8 +328,7 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
-  ; GFX10-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
+  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>), align 8, addrspace 8)
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: image_store_v3f16
@@ -452,8 +449,7 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
   ; GFX10-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
-  ; GFX10-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<4 x s16>), addrspace 8)
+  ; GFX10-NEXT:   G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<4 x s16>), addrspace 8)
   ; GFX10-NEXT:   S_ENDPGM 0
   ;
   ; GFX12-LABEL: name: image_store_v4f16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
index a3796197e4c53..fc77da3b1ae03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -706,12 +706,12 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -760,12 +760,12 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -868,12 +868,12 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -922,12 +922,12 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT:    image_atomic_add v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -1762,12 +1762,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -1816,12 +1816,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -1924,12 +1924,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -1978,12 +1978,12 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX10-NEXT:    s_mov_b32 s1, s3
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT:    image_atomic_add v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index 916b9c0835d41..ad49a27908173 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -119,9 +119,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX10NSA-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -193,9 +193,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10NSA-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -341,9 +341,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
@@ -778,7 +778,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s1, s3
 ; GFX10NSA-NEXT:    s_mov_b32 s2, s4
 ; GFX10NSA-NEXT:    s_mov_b32 s3, s5
-; GFX10NSA-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX10NSA-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT:    s_mov_b32 s4, s6
 ; GFX10NSA-NEXT:    s_mov_b32 s5, s7
 ; GFX10NSA-NEXT:    s_mov_b32 s6, s8
@@ -787,7 +787,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10NSA-NEXT:    s_mov_b32 s9, s11
 ; GFX10NSA-NEXT:    s_mov_b32 s10, s12
 ; GFX10NSA-NEXT:    s_mov_b32 s11, s13
-; GFX10NSA-NEXT:    image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index f5d11fcdff80a..9652917e9028e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -28,12 +28,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, s3
 ; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
 ; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
-; GFX10PLUS-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
+; GFX10PLUS-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10PLUS-NEXT:    s_mov_b32 s4, s6
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
-; GFX10PLUS-NEXT:    image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT:    image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
@@ -88,55 +88,56 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v2
+; GFX10-NEXT:    v_mov_b32_e32 v12, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v7
-; GFX10-NEXT:    v_mov_b32_e32 v9, v7
-; GFX10-NEXT:    v_mov_b32_e32 v10, v7
-; GFX10-NEXT:    v_mov_b32_e32 v11, v7
-; GFX10-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6
+; GFX10-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    v_mov_b32_e32 v4, v11
-; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10
+; GFX10-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v7, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v6, v4, s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
 ; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    v_mov_b32_e32 v9, v7
-; GFX11-NEXT:    v_mov_b32_e32 v8, v7
-; GFX11-NEXT:    v_mov_b32_e32 v10, v7
-; GFX11-NEXT:    v_mov_b32_e32 v11, v7
-; GFX11-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v6
+; GFX11-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-NEXT:    v_mov_b32_e32 v9, v6
+; GFX11-NEXT:    v_mov_b32_e32 v10, v6
+; GFX11-NEXT:    v_mov_b32_e32 v12, v2
+; GFX11-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s3, s5
 ; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    v_mov_b32_e32 v4, v11
-; GFX11-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX11-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT:    v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
+; GFX11-NEXT:    v_mov_b32_e32 v4, v10
+; GFX11-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v7, v4, s[10:11]
+; GFX11-NEXT:    global_store_b32 v6, v4, s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_3d_v4f32_xyzw_tfe:
@@ -200,55 +201,56 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v6, v2
+; GFX10-NEXT:    v_mov_b32_e32 v12, v2
 ; GFX10-NEXT:    s_mov_b32 s0, s2
 ; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v8, v7
-; GFX10-NEXT:    v_mov_b32_e32 v9, v7
-; GFX10-NEXT:    v_mov_b32_e32 v10, v7
-; GFX10-NEXT:    v_mov_b32_e32 v11, v7
-; GFX10-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v6
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6
+; GFX10-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s2, s4
 ; GFX10-NEXT:    s_mov_b32 s3, s5
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    v_mov_b32_e32 v4, v11
-; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v7
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8
+; GFX10-NEXT:    v_mov_b32_e32 v3, v9
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10
+; GFX10-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v7, v4, s[10:11]
+; GFX10-NEXT:    global_store_dword v6, v4, s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_mov_b32 s0, s2
 ; GFX11-NEXT:    s_mov_b32 s1, s3
 ; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    v_mov_b32_e32 v9, v7
-; GFX11-NEXT:    v_mov_b32_e32 v8, v7
-; GFX11-NEXT:    v_mov_b32_e32 v10, v7
-; GFX11-NEXT:    v_mov_b32_e32 v11, v7
-; GFX11-NEXT:    v_lshl_or_b32 v5, v1, 16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v6
+; GFX11-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-NEXT:    v_mov_b32_e32 v9, v6
+; GFX11-NEXT:    v_mov_b32_e32 v10, v6
+; GFX11-NEXT:    v_mov_b32_e32 v12, v2
+; GFX11-NEXT:    v_lshl_or_b32 v11, v1, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s3, s5
 ; GFX11-NEXT:    s_mov_b32 s4, s6
 ; GFX11-NEXT:    s_mov_b32 s5, s7
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    v_mov_b32_e32 v4, v11
-; GFX11-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
+; GFX11-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT:    v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
+; GFX11-NEXT:    v_mov_b32_e32 v4, v10
+; GFX11-NEXT:    image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v7, v4, s[10:11]
+; GFX11-NEXT:    global_store_b32 v6, v4, s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 613c73f7b9368..14b30e0d79946 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v17, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], v18, v13
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, -1, v14, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v17, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v16, v13, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -1392,23 +1392,23 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v8, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v6
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d5e22df59ccb3..ee7a040e41fd5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1106,210 +1106,210 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xfffff000
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x1000
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v4
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v16, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v15, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v14, v16, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1321,18 +1321,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1699,210 +1699,210 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v4
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v16, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v15, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v14, v16, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1914,18 +1914,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -3194,59 +3194,59 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v17, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v14, v4
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v4
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 0, v12
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v17, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v14, v0
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v10, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v17, v0
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v5
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v14, v4
-; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v17, v0, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
+; GISEL-NEXT:    v_mul_hi_u32 v1, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
-; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v4, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2]
+; GISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 0, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
 ; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
@@ -3274,7 +3274,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0, v3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a7e5ce3d21619..faad7e93da5d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1095,192 +1095,189 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT:    s_sub_u32 s4, 0, 0x12d8fb
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT:    s_mov_b32 s4, 1
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s5, 1
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_subb_u32 s4, 0, 0
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, 0
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s5, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v17, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, s5, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v19, v16
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v17, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v13
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v5, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, s4, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, s4, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v15, s4, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v18, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v12
-; GISEL-NEXT:    v_mul_lo_u32 v21, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v22, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v11
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v7, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v13, s4, v11
+; GISEL-NEXT:    v_mul_hi_u32 v14, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v20, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v9
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v21, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v22, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v21, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v20
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v22
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v17
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v16
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v1, v11
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v5
+; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v16, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v17, v8
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v9, v9, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v12
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v8
 ; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
index f6c36b32bb13e..23a7bb6ece488 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
@@ -6,13 +6,11 @@ define float @test() {
   ; GFX10: bb.0.bb:
   ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
-  ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX10-NEXT:   [[IMAGE_LOAD_V2_V2_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
-  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub1
-  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub0
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GFX10-NEXT:   [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
   ; GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
   ; GFX10-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
   ; GFX10-NEXT:   $vgpr0 = COPY [[V_ADD_F32_e64_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 1c9f35dd45fee..135efceb31fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
 
 ; TODO: Add global-isel when it can support bf16
 
@@ -23,139 +24,168 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
 }
 
 define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
-; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
-; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GCN-NEXT:    s_mov_b32 s0, 0x7060302
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GFX-940:       ; %bb.0:
+; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX-940-NEXT:    s_nop 0
+; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GFX-950:       ; %bb.0:
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x float> %src to <2 x bfloat>
   %cast = bitcast <2 x bfloat> %res to float
   ret float %cast
 }
 
 define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
-; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s2, s1, 0x10010
-; GCN-NEXT:    s_add_i32 s2, s2, s1
-; GCN-NEXT:    s_or_b32 s4, s1, 0x400000
-; GCN-NEXT:    s_add_i32 s5, s2, 0x7fff
-; GCN-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
-; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; GCN-NEXT:    s_cselect_b32 s1, s4, s5
-; GCN-NEXT:    s_lshr_b32 s2, s1, 16
-; GCN-NEXT:    s_bfe_u32 s1, s0, 0x10010
-; GCN-NEXT:    s_add_i32 s1, s1, s0
-; GCN-NEXT:    s_or_b32 s3, s0, 0x400000
-; GCN-NEXT:    s_add_i32 s4, s1, 0x7fff
-; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
-; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GCN-NEXT:    s_cselect_b32 s0, s3, s4
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GFX-940:       ; %bb.0:
+; GFX-940-NEXT:    s_bfe_u32 s2, s1, 0x10010
+; GFX-940-NEXT:    s_add_i32 s2, s2, s1
+; GFX-940-NEXT:    s_or_b32 s4, s1, 0x400000
+; GFX-940-NEXT:    s_add_i32 s5, s2, 0x7fff
+; GFX-940-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX-940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX-940-NEXT:    s_cselect_b32 s1, s4, s5
+; GFX-940-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX-940-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX-940-NEXT:    s_add_i32 s1, s1, s0
+; GFX-940-NEXT:    s_or_b32 s3, s0, 0x400000
+; GFX-940-NEXT:    s_add_i32 s4, s1, 0x7fff
+; GFX-940-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
+; GFX-940-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX-940-NEXT:    s_cselect_b32 s0, s3, s4
+; GFX-940-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX-940-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX-940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GFX-950:       ; %bb.0:
+; GFX-950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, s0, v0
+; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x float> %src to <2 x bfloat>
   %cast = bitcast <2 x bfloat> %res to float
   ret float %cast
 }
 
 define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
-; GCN-LABEL: v_test_cvt_f32_bf16_v:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
-; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: v_test_cvt_f32_bf16_v:
+; GFX-940:       ; %bb.0:
+; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX-940-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: v_test_cvt_f32_bf16_v:
+; GFX-950:       ; %bb.0:
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX-950-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX-950-NEXT:    ; return to shader part epilog
   %trunc = fptrunc float %src to bfloat
   %ext = fpext bfloat %trunc to float
   ret float %ext
 }
 
 define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
-; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT:    s_brev_b32 s4, 1
-; GCN-NEXT:    v_and_or_b32 v5, v1, s4, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s5, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s5
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
-; GCN-NEXT:    v_and_b32_e32 v6, 1, v5
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v0, v5, v0
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-NEXT:    v_and_or_b32 v1, v3, s4, v0
-; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 1
-; GCN-NEXT:    v_add3_u32 v0, v0, v1, s5
-; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; GCN-NEXT:    s_mov_b32 s0, 0x7060302
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GFX-940:       ; %bb.0:
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-940-NEXT:    s_brev_b32 s4, 1
+; GFX-940-NEXT:    v_and_or_b32 v5, v1, s4, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s5
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; GFX-940-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX-940-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v0, v5, v0
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-940-NEXT:    v_and_or_b32 v1, v3, s4, v0
+; GFX-940-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; GFX-940-NEXT:    v_add3_u32 v0, v0, v1, s5
+; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX-940-NEXT:    s_nop 0
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX-940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GFX-950:       ; %bb.0:
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x double> %src to <2 x bfloat>
   %cast = bitcast <2 x bfloat> %res to float
   ret float %cast
 }
 
 define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
-; GCN-LABEL: fptrunc_f32_f32_to_v2bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
-; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GCN-NEXT:    s_mov_b32 s0, 0x7060302
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX-940-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX-940-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX-940-NEXT:    s_nop 0
+; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX-950-NEXT:    ; return to shader part epilog
 entry:
   %a.cvt = fptrunc float %a to bfloat
   %b.cvt = fptrunc float %b to bfloat
@@ -166,26 +196,31 @@ entry:
 }
 
 define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
-; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
-; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
-; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
-; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
-; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
-; GCN-NEXT:    s_mov_b32 s0, 0x7060302
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX-940-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX-940-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; GFX-940-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX-940-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GFX-940-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
+; GFX-940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX-940-NEXT:    s_nop 0
+; GFX-940-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX-940-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GFX-940-NEXT:    ; return to shader part epilog
+;
+; GFX-950-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1|
+; GFX-950-NEXT:    ; return to shader part epilog
 entry:
   %a.neg = fneg float %a
   %a.cvt = fptrunc float %a.neg to bfloat
@@ -198,19 +233,27 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
-; GCN-NEXT:    v_or_b32_e32 v4, 0x400000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX-940-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.cvt = fptrunc float %a to bfloat
   store bfloat %a.cvt, ptr %out
@@ -218,20 +261,28 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16_abs:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
-; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.abs = call float @llvm.fabs.f32(float %a)
   %a.cvt = fptrunc float %a.abs to bfloat
@@ -240,20 +291,28 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16_neg:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
-; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.neg = fneg float %a
   %a.cvt = fptrunc float %a.neg to bfloat
@@ -262,29 +321,36 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT:    s_brev_b32 s0, 1
-; GCN-NEXT:    v_and_or_b32 v5, v1, s0, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-940-NEXT:    s_brev_b32 s0, 1
+; GFX-940-NEXT:    v_and_or_b32 v5, v1, s0, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.cvt = fptrunc double %a to bfloat
   store bfloat %a.cvt, ptr %out
@@ -292,30 +358,37 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16_neg:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
-; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    s_brev_b32 s4, 1
-; GCN-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    s_brev_b32 s4, 1
+; GFX-940-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-940-NEXT:    v_and_or_b32 v5, v6, s4, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.neg = fneg double %a
   %a.cvt = fptrunc double %a.neg to bfloat
@@ -324,30 +397,37 @@ entry:
 }
 
 define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16_abs:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
-; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT:    s_brev_b32 s0, 1
-; GCN-NEXT:    v_and_or_b32 v5, v6, s0, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-940-NEXT:    s_brev_b32 s0, 1
+; GFX-940-NEXT:    v_and_or_b32 v5, v6, s0, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e64 v0, |v[0:1]|
+; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX-950-NEXT:    flat_store_short v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
   %a.abs = call double @llvm.fabs.f64(double %a)
   %a.cvt = fptrunc double %a.abs to bfloat
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index 4eac26e853c2a..b64968c9336b9 100644
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -80,6 +80,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX950-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX950-XNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX1010 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX1010-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX1010-XNACK %s
@@ -180,6 +183,9 @@
 ; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
 ; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-"
 ; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+"
+; GFX950: .amdgcn_target "amdgcn-amd-amdhsa--gfx950"
+; GFX950-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack-"
+; GFX950-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack+"
 ; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010"
 ; GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack-"
 ; GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack+"
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index f1f4edb94a617..99344f16d4cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -57,6 +57,7 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1011 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1012 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s
@@ -139,6 +140,7 @@
 ; GFX940:        EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
 ; GFX941:        EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B)
 ; GFX942:        EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
+; GFX950:        EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
 ; GFX1010:       EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
 ; GFX1011:       EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34)
 ; GFX1012:       EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35)
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
index 961b89ab28f62..3ad2a9df764be 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
@@ -12,6 +12,9 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s
 
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s
+
 ; NO-SRAM-ECC-GFX906:      Flags [
 ; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_FEATURE_XNACK_V3   (0x100)
 ; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
@@ -44,6 +47,11 @@
 ; SRAM-ECC-GFX940:    EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
 ; SRAM-ECC-GFX940:  ]
 
+; SRAM-ECC-GFX950: Flags [
+; SRAM-ECC-GFX950:    EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200)
+; SRAM-ECC-GFX950:    EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
+; SRAM-ECC-GFX950:  ]
+
 define amdgpu_kernel void @elf_header() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index 13640b74a7937..318ecd16a2ccb 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
 
@@ -17,6 +19,11 @@
 ; GFX11-MESA: .long 45100
 ; GFX11-MESA-NEXT: .long 1024
 
+; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX950-MESA: .long 45100
+; GFX950-MESA-NEXT: .long 512
+
 ; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
 
 ; GFX1200-MESA: .long 45100
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 27282a453075b..08122cd0d89ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
 
 define float @v_fmaximum3_f32(float %a, float %b, float %c) {
 ; GFX12-LABEL: v_fmaximum3_f32:
@@ -19,9 +20,11 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -46,9 +49,11 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -71,10 +76,13 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -101,9 +109,11 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -129,9 +139,11 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
@@ -157,9 +169,11 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -185,9 +199,11 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -215,9 +231,11 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
@@ -245,9 +263,11 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, -|v0|, -|v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v1, v0, -|v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -278,9 +298,11 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, -v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
@@ -306,9 +328,11 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
@@ -334,9 +358,11 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
@@ -362,9 +388,11 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v2, 0x41000000, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
@@ -389,9 +417,11 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) {
 ; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -416,9 +446,11 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
 ; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
@@ -443,9 +475,11 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
 ; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -472,9 +506,11 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x41800000, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
@@ -500,15 +536,19 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX9-NEXT:    v_max_f32_e32 v6, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v4, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v5, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -534,15 +574,19 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX9-NEXT:    v_max_f32_e32 v6, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v0, v4
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v1, v5
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -568,15 +612,19 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX9-NEXT:    v_max_f32_e64 v6, |v1|, |v3|
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v2|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v2, v0, |v4|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v2, v1, |v5|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -605,15 +653,19 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX9-NEXT:    v_max_f32_e64 v6, -v1, -v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v2, v0, -v4
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v2, v1, -v5
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
@@ -642,15 +694,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX9-NEXT:    v_max_f32_e32 v4, 2.0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v4, 2.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v4, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
@@ -676,15 +732,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX9-NEXT:    v_max_f32_e32 v4, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -711,21 +771,27 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX9-NEXT:    v_max_f32_e32 v9, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v6, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v7, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v8, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -752,21 +818,27 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX9-NEXT:    v_max_f32_e32 v9, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v6
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v1, v7
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v2, v8
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -793,21 +865,27 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX9-NEXT:    v_max_f32_e64 v9, |v2|, |v5|
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v5, |v1|, |v4|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v4, |v0|, |v3|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v6|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v1, |v7|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v2, |v8|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
@@ -837,21 +915,27 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX9-NEXT:    v_max_f32_e64 v9, -v2, -v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v5, -v1, -v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v4, -v0, -v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v6
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v1, -v7
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e64 v3, v2, -v8
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
@@ -881,21 +965,27 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v6, v0, v3
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, v2, v5
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
@@ -922,21 +1012,27 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX9-NEXT:    v_max_f32_e32 v6, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -962,9 +1058,11 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -989,9 +1087,11 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1048,9 +1151,11 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1076,9 +1181,11 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, v0, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1104,9 +1211,11 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1132,9 +1241,11 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1162,9 +1273,11 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, -v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
@@ -1192,9 +1305,11 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, -|v0|, -|v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e64 v1, v0, -|v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1225,9 +1340,11 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, -v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
@@ -1253,9 +1370,11 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e64 v3, v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
@@ -1281,9 +1400,11 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
@@ -1309,9 +1430,11 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v2, 0x4800, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
@@ -1336,9 +1459,11 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
 ; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1363,9 +1488,11 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
@@ -1390,9 +1517,11 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
 ; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1419,9 +1548,11 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
 ; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, 0x4c00, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
@@ -1448,19 +1579,23 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v2, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
@@ -1486,19 +1621,23 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -1527,22 +1666,25 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_pk_max_f16 v3, v3, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
-; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
   %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
@@ -1571,19 +1713,23 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x half> %a
   %b.fneg = fneg <2 x half> %b
@@ -1610,21 +1756,25 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -1650,19 +1800,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
 ; GFX9-NEXT:    v_pk_max_f16 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v4, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
@@ -1690,29 +1844,35 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v5, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
@@ -1740,29 +1900,35 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -1799,33 +1965,37 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_pk_max_f16 v7, v7, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
-; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v11
-; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v6, v6, v10
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
   %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
@@ -1856,29 +2026,35 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x half> %a
   %b.fneg = fneg <3 x half> %b
@@ -1907,29 +2083,34 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v7, v1, 2.0
+; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_movk_i32 s0, 0x7e00
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s1
 ; GFX9-NEXT:    v_pk_max_f16 v4, v4, v2
-; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s0
 ; GFX9-NEXT:    v_pk_max_f16 v7, v7, v3
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -1957,29 +2138,35 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
 ; GFX9-NEXT:    v_pk_max_f16 v4, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 4.0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
-; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
@@ -2007,33 +2194,41 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v5, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v4, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0)
@@ -2061,33 +2256,41 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v5
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2124,37 +2327,43 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_pk_max_f16 v7, v7, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v5
-; GFX9-NEXT:    v_perm_b32 v2, v8, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v1, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v11
-; GFX9-NEXT:    v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT:    v_perm_b32 v6, v9, v0, s0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v2, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
   %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
@@ -2185,33 +2394,41 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <4 x half> %a
   %b.fneg = fneg <4 x half> %b
@@ -2240,35 +2457,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT:    v_perm_b32 v4, v8, v1, s0
 ; GFX9-NEXT:    v_pk_max_f16 v4, v4, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT:    v_perm_b32 v8, v5, v0, s0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_pk_max_f16 v8, v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v6, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_perm_b32 v1, v7, v1, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
   %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2296,33 +2519,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
 ; GFX9-NEXT:    v_pk_max_f16 v4, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v4, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
@@ -2346,12 +2577,14 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2377,12 +2610,14 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do
 ;
 ; GFX9-LABEL: s_fmaximum3_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX9-NEXT:    v_max_f64 v[2:3], s[0:1], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
@@ -2447,12 +2683,14 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], |v[0:1]|, v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2479,12 +2717,14 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], |v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2511,12 +2751,14 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], |v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2543,12 +2785,14 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], |v[0:1]|, |v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], |v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2577,12 +2821,14 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], -v[0:1], -v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2611,12 +2857,14 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -|v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2648,12 +2896,14 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], -v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2680,12 +2930,14 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2712,12 +2964,14 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2743,15 +2997,17 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) {
 ; GFX9-LABEL: v_fmaximum3_f64_const0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], s[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2777,14 +3033,15 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2810,12 +3067,14 @@ define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], 4.0
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2841,12 +3100,14 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], 4.0
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2871,17 +3132,18 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
 ; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40300000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40300000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c)
 ; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float
 ; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
@@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in
 ; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
@@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
 ; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s4
+; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s0
 ; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c)
@@ -3080,12 +3355,14 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
   %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
   ret <2 x double> %insert.1
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX940: {{.*}}
+; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index d9ba2de48bb01..43293512c8c21 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
 
 define float @v_fminimum3_f32(float %a, float %b, float %c) {
 ; GFX12-LABEL: v_fminimum3_f32:
@@ -19,9 +20,11 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -46,9 +49,11 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -71,10 +76,13 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -101,9 +109,11 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -129,9 +139,11 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, v0, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
@@ -157,9 +169,11 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -185,9 +199,11 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -215,9 +231,11 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, -v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
@@ -245,9 +263,11 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, -|v0|, -|v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v1, v0, -|v2|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -278,9 +298,11 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, -v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
@@ -306,9 +328,11 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e64 v3, v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
@@ -334,9 +358,11 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
@@ -362,9 +388,11 @@ define float @v_fminimum3_f32_const0(float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v2, 0x41000000, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
@@ -389,9 +417,11 @@ define float @v_fminimum3_f32__const2(float %a, float %b) {
 ; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -416,9 +446,11 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
 ; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
@@ -443,9 +475,11 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
 ; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -472,9 +506,11 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
 ; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, 0x41800000, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
@@ -500,15 +536,19 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX9-NEXT:    v_min_f32_e32 v6, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v4, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v5, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -534,15 +574,19 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX9-NEXT:    v_min_f32_e32 v6, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v0, v4
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v1, v5
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -568,15 +612,19 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX9-NEXT:    v_min_f32_e64 v6, |v1|, |v3|
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, |v2|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v2, v0, |v4|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v2, v1, |v5|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
@@ -605,15 +653,19 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX9-NEXT:    v_min_f32_e64 v6, -v1, -v3
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, -v0, -v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v2, v0, -v4
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v2, v1, -v5
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
@@ -642,15 +694,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX9-NEXT:    v_min_f32_e32 v4, 2.0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v4, 2.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v4, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
@@ -676,15 +732,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX9-NEXT:    v_min_f32_e32 v4, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
@@ -711,21 +771,27 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX9-NEXT:    v_min_f32_e32 v9, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v6, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v7, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v8, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -752,21 +818,27 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX9-NEXT:    v_min_f32_e32 v9, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v6
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v1, v7
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v2, v8
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -793,21 +865,27 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX9-NEXT:    v_min_f32_e64 v9, |v2|, |v5|
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v5, |v1|, |v4|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v4, |v0|, |v3|
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v0, |v6|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v1, |v7|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v2, |v8|
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
@@ -837,21 +915,27 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX9-NEXT:    v_min_f32_e64 v9, -v2, -v5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v5, -v1, -v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v4, -v0, -v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v0, -v6
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v1, -v7
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e64 v3, v2, -v8
 ; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
@@ -881,21 +965,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v6, v0, v3
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, v2, v5
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
@@ -922,21 +1012,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX9-NEXT:    v_min_f32_e32 v6, v2, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v1
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
@@ -962,9 +1058,11 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -989,9 +1087,11 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1048,9 +1151,11 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1076,9 +1181,11 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, v0, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1104,9 +1211,11 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1132,9 +1241,11 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, |v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1162,9 +1273,11 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, -v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
@@ -1192,9 +1305,11 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, -|v0|, -|v1|
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e64 v1, v0, -|v2|
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
@@ -1225,9 +1340,11 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, -v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
@@ -1253,9 +1370,11 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e64 v3, v0, -v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
@@ -1281,9 +1400,11 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
@@ -1309,9 +1430,11 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v2, 0x4800, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
@@ -1336,9 +1459,11 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
 ; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1363,9 +1488,11 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v2, 4.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
@@ -1390,9 +1517,11 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
 ; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, 4.0, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1419,9 +1548,11 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
 ; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, 0x4c00, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
@@ -1448,19 +1579,23 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v2, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
@@ -1486,19 +1621,23 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -1527,22 +1666,25 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_pk_min_f16 v3, v3, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
-; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
   %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
@@ -1571,19 +1713,23 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x half> %a
   %b.fneg = fneg <2 x half> %b
@@ -1610,21 +1756,25 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -1650,19 +1800,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
 ; GFX9-NEXT:    v_pk_min_f16 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v0, v4, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
@@ -1690,29 +1844,35 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v5, v1
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
@@ -1740,29 +1900,35 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -1799,33 +1965,37 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_pk_min_f16 v7, v7, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
-; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v11
-; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v6, v6, v10
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
   %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
@@ -1856,29 +2026,35 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x half> %a
   %b.fneg = fneg <3 x half> %b
@@ -1907,29 +2083,34 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v7, v1, 2.0
+; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_movk_i32 s0, 0x7e00
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s1
 ; GFX9-NEXT:    v_pk_min_f16 v4, v4, v2
-; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s0
 ; GFX9-NEXT:    v_pk_min_f16 v7, v7, v3
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -1957,29 +2138,35 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
 ; GFX9-NEXT:    v_pk_min_f16 v4, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 4.0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
-; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
@@ -2007,33 +2194,41 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v5, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v4, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0)
@@ -2061,33 +2256,41 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v5
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2124,37 +2327,43 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_pk_min_f16 v7, v7, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v5
-; GFX9-NEXT:    v_perm_b32 v2, v8, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v1, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v11
-; GFX9-NEXT:    v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT:    v_perm_b32 v6, v9, v0, s0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v2, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
   %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
@@ -2185,33 +2394,41 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <4 x half> %a
   %b.fneg = fneg <4 x half> %b
@@ -2240,35 +2457,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT:    v_perm_b32 v4, v8, v1, s0
 ; GFX9-NEXT:    v_pk_min_f16 v4, v4, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT:    v_perm_b32 v8, v5, v0, s0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_pk_min_f16 v8, v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v6, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_perm_b32 v1, v7, v1, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
   %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2296,33 +2519,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
 ; GFX9-NEXT:    v_pk_min_f16 v4, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT:    v_perm_b32 v2, v1, v4, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX9-NEXT:    v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
   %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
@@ -2346,12 +2577,14 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2377,12 +2610,14 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do
 ;
 ; GFX9-LABEL: s_fminimum3_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX9-NEXT:    v_min_f64 v[2:3], s[0:1], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
@@ -2447,12 +2683,14 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], |v[0:1]|, v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2479,12 +2717,14 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], |v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2511,12 +2751,14 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], |v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2543,12 +2785,14 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], |v[0:1]|, |v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], |v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2577,12 +2821,14 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], -v[0:1], -v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2611,12 +2857,14 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -|v[4:5]|
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2648,12 +2896,14 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], -v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2680,12 +2930,14 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2712,12 +2964,14 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2743,15 +2997,17 @@ define double @v_fminimum3_f64_const0(double %b, double %c) {
 ; GFX9-LABEL: v_fminimum3_f64_const0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], s[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2777,14 +3033,15 @@ define double @v_fminimum3_f64__const2(double %a, double %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2810,12 +3067,14 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], 4.0
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2841,12 +3100,14 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], 4.0
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2871,17 +3132,18 @@ define double @v_fminimum3_f64_const1_const2(double %a) {
 ; GFX9-LABEL: v_fminimum3_f64_const1_const2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40200000
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40300000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40300000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c)
 ; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float
 ; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
@@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in
 ; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
@@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
 ; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s4
+; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s0
 ; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c)
@@ -3080,12 +3355,14 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
   %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
   ret <2 x double> %insert.1
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX940: {{.*}}
+; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index b1c8107c3d1dd..e7c425a2d2752 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -698,12 +698,9 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
 ; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
-; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
 ; SI-NEXT:    s_cmpk_lt_i32 s4, 0x7c00
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 0adce2b84aa0d..3eb9d474ec030 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -63,17 +63,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; SDAG-NEXT:    image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; SDAG-NEXT:    image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT:    v_mov_b32_e32 v4, v1
 ; SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v9, v0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; SDAG-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; SDAG-NEXT:    v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v1
+; SDAG-NEXT:    v_add_f32_e32 v0, v3, v0
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e800000, v0
-; SDAG-NEXT:    image_store v[0:2], v[3:4], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; SDAG-NEXT:    image_store v[0:2], [v1, v1], s[0:7] dim:SQ_RSRC_IMG_2D unorm
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: issue92561:
@@ -131,7 +130,6 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; GISEL-NEXT:    image_sample_c_lz v0, [v2, v2, v0, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GISEL-NEXT:    image_sample_c_lz v3, [v2, v3, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GISEL-NEXT:    image_sample_c_lz v4, [v2, v2, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; GISEL-NEXT:    s_mov_b32 s21, s20
 ; GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -139,10 +137,9 @@ define void @issue92561(ptr addrspace(1) %arg) {
 ; GISEL-NEXT:    v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f32_e32 v0, v4, v0
-; GISEL-NEXT:    v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e800000, v0
-; GISEL-NEXT:    image_store v[1:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
+; GISEL-NEXT:    image_store v[1:3], [v2, v2], s[4:11] dim:SQ_RSRC_IMG_2D unorm
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %descriptor = load <8 x i32>, ptr addrspace(1) %arg, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll
new file mode 100644
index 0000000000000..73f6dcb3a2a1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll
@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT160K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT32K %s
+
+; gfx950 supports up to 160 KB of LDS memory. The generic target does not.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR-LIMIT160K: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
+; ERROR-LIMIT64K: error: <unknown>:0:0: local memory (163844) exceeds limit (65536) in function 'test_lds_limit'
+; ERROR-LIMIT32K: error: <unknown>:0:0: local memory (163844) exceeds limit (32768) in function 'test_lds_limit'
+@dst = addrspace(3) global [40961 x i32] poison
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+  %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+  store i32 %val, ptr addrspace(3) %gep
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
new file mode 100644
index 0000000000000..6ebfc9a5e9d4f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s
+
+; gfx950 supports up to 160 KB of configurable LDS memory.
+; This test checks LDS allocations just above the old 128 KiB size and at the new 160 KiB maximum.
+
+@lds.i32 = addrspace(3) global i32 poison
+@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
+@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison
+
+; GCN-LABEL: test_lds_array_size_131076:
+; GCN: .amdhsa_group_segment_fixed_size 131076
+; GCN: ; LDSByteSize: 131076 bytes/workgroup
+; MESA: granulated_lds_size = 65
+define amdgpu_kernel void @test_lds_array_size_131076() {
+  %gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_size_163840:
+; GCN: .amdhsa_group_segment_fixed_size 163840
+; GCN: ; LDSByteSize: 163840 bytes/workgroup
+; MESA: granulated_lds_size = 80
+define amdgpu_kernel void @test_lds_array_size_163840() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
new file mode 100644
index 0000000000000..22cad8ab5f536
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s
+
+; GFX950 supports up to 160 KB of configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i32:
+; PAL: .lds_size:       0x28000
+; PAL: test_lds_i32:
+; PAL: .lds_size:       0x4
+
+
+@lds.i32 = addrspace(3) global i32 poison
+@lds.array.i32 = addrspace(3) global [40959 x i32] poison
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 8313f5b655efb..bd35ee3f00973 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
new file mode 100644
index 0000000000000..8f67375a09cb7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.load.lds
+
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.load.lds),
+
+
+declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX950-GISEL-NEXT:    s_nop 4
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx3 v[0:1], off offset:16 sc0
+; GFX950-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx3 v1, s[0:1] offset:32 nt
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 3
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx3 v0, s[0:1] offset:32 nt
+; GFX950-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX950-GISEL-NEXT:    s_nop 4
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
+; GFX950-GISEL-NEXT:    s_endpgm
+  %voffset.64 = zext i32 %voffset to i64
+  %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16)
+  ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX950-GISEL-NEXT:    s_nop 4
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx4 v[0:1], off offset:16 sc0
+; GFX950-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx4 v1, s[0:1] offset:32 nt
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 3
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx4 v0, s[0:1] offset:32 nt
+; GFX950-GISEL-NEXT:    s_endpgm
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
+; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX950-GISEL-NEXT:    s_nop 4
+; GFX950-GISEL-NEXT:    global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
+; GFX950-GISEL-NEXT:    s_endpgm
+  %voffset.64 = zext i32 %voffset to i64
+  %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index 3f88ab1f64912..40b4b33e74a6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -56,9 +56,9 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX10-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -92,9 +92,9 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -164,9 +164,9 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -382,8 +382,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ;
 ; GFX10-LABEL: gather4_l_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
-; GFX10-NEXT:    image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index b75723f544d1d..e789b964d3cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -283,8 +283,8 @@ main_body:
 define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) {
 ; GFX11-LABEL: load_2dmsaa_a16:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x01,0x61,0xf0,0x01,0x00,0x00,0x00]
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x99,0x01,0x61,0xf0,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 4cd761b555d6e..22b4c5a7362ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -1,12 +1,12 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
+; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
 
 ; Default NSA threshold is 3 addresses
 ; GCN-LABEL: {{^}}sample_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 42005499bded8..10a85aa7c02c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -107,9 +107,9 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX10-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -117,9 +117,9 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -153,9 +153,9 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX10-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -163,9 +163,9 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -245,9 +245,9 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -255,9 +255,9 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -424,9 +424,9 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -434,9 +434,9 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -1304,15 +1304,15 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ;
 ; GFX10-LABEL: sample_l_2d:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
-; GFX10-NEXT:    image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: sample_l_2d:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 48491729f109a..be66d6516f438 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -110,28 +110,29 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX10-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX10-NEXT:    image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    global_store_dword v4, v3, s[12:13]
+; GFX10-NEXT:    global_store_dword v4, v1, s[12:13]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: image_sample_2d_f16_tfe:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_mov_b32 s14, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v4
-; GFX11-NEXT:    v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX11-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX11-NEXT:    image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    global_store_b32 v4, v3, s[12:13]
+; GFX11-NEXT:    global_store_b32 v4, v1, s[12:13]
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: image_sample_2d_f16_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
new file mode 100644
index 0000000000000..2da602713d72c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; FIXME: bfloat vector arguments are broken in globalisel.
+; https://github.com/llvm/llvm-project/issues/77055
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat>, <8 x bfloat>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.bf16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) #1 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], 48
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], 32
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s16
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s23
+; GCN-NEXT:    v_mov_b32_e32 v9, s17
+; GCN-NEXT:    v_mov_b32_e32 v10, s18
+; GCN-NEXT:    v_mov_b32_e32 v11, s19
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
+  store volatile <16 x float> %result, ptr addrspace(1) null
+  store volatile <16 x float> %arg2, ptr addrspace(1) null
+  ret void
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <16 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a31, s23
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a30, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a29, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a28, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a27, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a26, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a25, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a24, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a23, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a22, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a21, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a20, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a19, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a18, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a17, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a16, s8
+; GCN-NEXT:    v_mov_b32_e32 v8, s20
+; GCN-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
+; GCN-NEXT:    v_mov_b32_e32 v10, s22
+; GCN-NEXT:    v_mov_b32_e32 v11, s23
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store volatile <16 x float> %arg2, ptr addrspace(1) %out
+  store volatile <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a31, s23
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a30, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a29, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a28, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a27, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a26, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a25, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a24, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a23, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a22, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a21, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a20, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a19, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a18, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a17, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a16, s8
+; GCN-NEXT:    v_mov_b32_e32 v8, s20
+; GCN-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v10, s22
+; GCN-NEXT:    v_mov_b32_e32 v11, s23
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
+  store volatile <16 x float> %arg2, ptr addrspace(1) %out
+  store volatile <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
new file mode 100644
index 0000000000000..88d04e9fb428a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.16x16x32.f16
+; --------------------------------------------------------------------
+
+define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_16x16x32_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_f16__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
+; GCN-LABEL: test_mfma_f32_16x16x32_f16__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v3
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3]
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_f32_16x16x32_f16___flags__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
+; GCN-LABEL: test_mfma_f32_16x16x32_f16___flags__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v3
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x float> %result
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.f32.32x32x16.f16
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
+; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v3
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v5
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v6
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v7
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_f32_32x32x16_f16__flags__mac(<16 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) {
+; GCN-LABEL: test_mfma_f32_32x32x16_f16__flags__mac:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v3
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v5
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v6
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v7
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[20:23], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
+  ret <16 x float> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
new file mode 100644
index 0000000000000..eeef4eeb65a69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
@@ -0,0 +1,32 @@
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.prng.b32(i32) #0
+
+; GCN-LABEL: {{^}}prng_b32:
+; GCN: v_prng_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define amdgpu_kernel void @prng_b32(ptr addrspace(1) %out, i32 %src) #1 {
+  %prng = call i32 @llvm.amdgcn.prng.b32(i32 %src) #0
+  store i32 %prng, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}prng_b32_constant_4:
+; GCN: v_prng_b32_e32 {{v[0-9]+}}, 4
+define amdgpu_kernel void @prng_b32_constant_4(ptr addrspace(1) %out) #1 {
+  %prng = call i32 @llvm.amdgcn.prng.b32(i32 4) #0
+  store i32 %prng, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}prng_b32_constant_100:
+; GCN: v_prng_b32_e32 {{v[0-9]+}}, 0x64
+define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 {
+  %prng = call i32 @llvm.amdgcn.prng.b32(i32 100) #0
+  store i32 %prng, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
new file mode 100644
index 0000000000000..58b1d0da4a5f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; FIXME: Not a great error
+; ERR-SDAG: LLVM ERROR: Do not know how to expand this operator's operand!
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.ptr.buffer.load.lds),
+
+declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-LABEL: buffer_load_lds_dwordx3:
+; GFX950:       ; %bb.0: ; %main_body
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 off, s[0:3], 0 lds
+; GFX950-NEXT:    buffer_load_dwordx3 off, s[0:3], 0 offset:4 sc0 lds
+; GFX950-NEXT:    buffer_load_dwordx3 off, s[0:3], 0 offset:8 nt lds
+; GFX950-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    ds_read_b32 v0, v0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 0, i32 8, i32 2)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_imm_voffset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0x800
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_v_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_s_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 off, s[0:3], s5 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vs_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], s5 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vs_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], s5 offen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-LABEL: buffer_load_lds_dwordx4:
+; GFX950:       ; %bb.0: ; %main_body
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 off, s[0:3], 0 lds
+; GFX950-NEXT:    buffer_load_dwordx4 off, s[0:3], 0 offset:4 sc0 lds
+; GFX950-NEXT:    buffer_load_dwordx4 off, s[0:3], 0 offset:8 nt lds
+; GFX950-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    ds_read_b32 v0, v0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 0, i32 8, i32 2)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_imm_voffset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0x800
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 2048, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_v_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_s_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 off, s[0:3], s5 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_vs_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], s5 offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_vs_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], s5 offen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX950-GISEL: {{.*}}
+; GFX950-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
new file mode 100644
index 0000000000000..cfe9545b074e3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
+
+; ERR-SDAG: LLVM ERROR: Do not know how to expand this operator's operand!
+; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.ptr.buffer.load.lds),
+
+declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+;---------------------------------------------------------------------
+; dwordx3
+;---------------------------------------------------------------------
+
+define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 8
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen lds
+; GFX950-SDAG-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-SDAG-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b32 v0, v0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 8
+; GFX950-GISEL-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen lds
+; GFX950-GISEL-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-GISEL-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b32 v0, v0
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 8, i32 2)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_v_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v[0:1], s[0:3], 0 idxen offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_s_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v0, s[0:3], s5 idxen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vs_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v[0:1], s[0:3], s5 idxen offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx3_vs_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx3 v[0:1], s[0:3], s5 idxen offen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
+
+;---------------------------------------------------------------------
+; dwordx4
+;---------------------------------------------------------------------
+
+define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 8
+; GFX950-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen lds
+; GFX950-SDAG-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-SDAG-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b32 v0, v0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 8
+; GFX950-GISEL-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen lds
+; GFX950-GISEL-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds
+; GFX950-GISEL-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b32 v0, v0
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 4, i32 1)
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 8, i32 2)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 0, i32 0, i32 2048, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_v_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v[0:1], s[0:3], 0 idxen offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_s_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v0, s[0:3], s5 idxen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_vs_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v[0:1], s[0:3], s5 idxen offen lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dwordx4_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX950-LABEL: buffer_load_lds_dwordx4_vs_imm_offset:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_mov_b32 m0, s4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    buffer_load_dwordx4 v[0:1], s[0:3], s5 idxen offen offset:2048 lds
+; GFX950-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index d90c4a75ac5de..e782f53cee608 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -30,24 +30,24 @@ define half @v_maximum_f16(half %src0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16:
 ; GFX10:       ; %bb.0:
@@ -102,12 +102,6 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,24 +150,24 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -228,12 +222,6 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -284,26 +272,26 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f16__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -365,26 +353,26 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f16__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f16__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX900-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX950-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -453,34 +441,34 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_max_f16_e32 v1, s16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_max_f16_e32 v1, s0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_max_f16_e32 v1, s16, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_f16:
 ; GFX10:       ; %bb.0:
@@ -567,35 +555,35 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16:
 ; GFX10:       ; %bb.0:
@@ -668,12 +656,6 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -736,35 +718,35 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -837,12 +819,6 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -917,50 +893,50 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-NEXT:    s_lshr_b32 s4, s17, 16
-; GFX9-NEXT:    v_pk_max_f16 v1, s16, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
-; GFX9-NEXT:    s_lshr_b32 s5, s16, 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_v2f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX940-NEXT:    v_pk_max_f16 v1, s0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX940-NEXT:    v_mov_b32_e32 v3, s1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
-; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX940-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_mov_b32_e32 v1, s17
+; GFX900-NEXT:    s_lshr_b32 s4, s17, 16
+; GFX900-NEXT:    v_pk_max_f16 v1, s16, v1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
+; GFX900-NEXT:    s_lshr_b32 s5, s16, 16
+; GFX900-NEXT:    v_mov_b32_e32 v3, s4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_v2f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_mov_b32_e32 v1, s1
+; GFX950-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX950-NEXT:    v_pk_max_f16 v1, s0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX950-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX950-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX950-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_v2f16:
 ; GFX10:       ; %bb.0:
@@ -1065,41 +1041,41 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16:
 ; GFX10:       ; %bb.0:
@@ -1187,13 +1163,6 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,41 +1238,41 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -1391,13 +1360,6 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1487,51 +1449,51 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16:
 ; GFX10:       ; %bb.0:
@@ -1635,13 +1597,6 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1731,51 +1686,51 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -1879,13 +1834,6 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2023,83 +1971,83 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v8f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v8, v3, v7
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
-; GFX9-NEXT:    v_pk_max_f16 v7, v2, v6
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
-; GFX9-NEXT:    v_pk_max_f16 v6, v1, v5
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
-; GFX9-NEXT:    v_pk_max_f16 v5, v0, v4
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v7, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v10, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v8f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v8, v3, v7
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v7, v2, v6
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
-; GFX940-NEXT:    v_perm_b32 v3, v3, v10, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v6, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
-; GFX940-NEXT:    v_perm_b32 v2, v2, v8, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v5, v0, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
-; GFX940-NEXT:    v_perm_b32 v1, v1, v7, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v6, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v8f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX900-NEXT:    v_pk_max_f16 v7, v2, v6
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX900-NEXT:    v_pk_max_f16 v6, v1, v5
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX900-NEXT:    v_pk_max_f16 v5, v0, v4
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v7, s4
+; GFX900-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX900-NEXT:    v_perm_b32 v3, v3, v10, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v8f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v7, v2, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX950-NEXT:    v_perm_b32 v3, v3, v10, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v6, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX950-NEXT:    v_perm_b32 v2, v2, v8, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v5, v0, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX950-NEXT:    v_perm_b32 v1, v1, v7, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v6, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f16:
 ; GFX10:       ; %bb.0:
@@ -2400,147 +2348,147 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v16f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v16, v7, v15
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
-; GFX9-NEXT:    v_pk_max_f16 v15, v6, v14
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
-; GFX9-NEXT:    v_pk_max_f16 v14, v5, v13
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
-; GFX9-NEXT:    v_pk_max_f16 v13, v4, v12
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
-; GFX9-NEXT:    v_pk_max_f16 v12, v3, v11
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
-; GFX9-NEXT:    v_pk_max_f16 v11, v2, v10
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
-; GFX9-NEXT:    v_pk_max_f16 v10, v1, v9
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
-; GFX9-NEXT:    v_pk_max_f16 v9, v0, v8
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v10, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v12, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v13, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v14, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v15, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v18, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v16f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_max_f16 v16, v7, v15
-; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v15, v6, v14
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
-; GFX940-NEXT:    v_perm_b32 v7, v7, v18, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v14, v5, v13
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
-; GFX940-NEXT:    v_perm_b32 v6, v6, v16, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v13, v4, v12
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
-; GFX940-NEXT:    v_perm_b32 v5, v5, v15, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v12, v3, v11
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
-; GFX940-NEXT:    v_perm_b32 v4, v4, v14, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v11, v2, v10
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
-; GFX940-NEXT:    v_perm_b32 v3, v3, v13, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v10, v1, v9
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
-; GFX940-NEXT:    v_perm_b32 v2, v2, v12, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_max_f16 v9, v0, v8
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
-; GFX940-NEXT:    v_perm_b32 v1, v1, v11, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v10, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v16f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX900-NEXT:    v_pk_max_f16 v15, v6, v14
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX900-NEXT:    v_pk_max_f16 v14, v5, v13
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX900-NEXT:    v_pk_max_f16 v13, v4, v12
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX900-NEXT:    v_pk_max_f16 v12, v3, v11
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX900-NEXT:    v_pk_max_f16 v11, v2, v10
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX900-NEXT:    v_pk_max_f16 v10, v1, v9
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX900-NEXT:    v_pk_max_f16 v9, v0, v8
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v10, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v11, s4
+; GFX900-NEXT:    v_perm_b32 v2, v2, v12, s4
+; GFX900-NEXT:    v_perm_b32 v3, v3, v13, s4
+; GFX900-NEXT:    v_perm_b32 v4, v4, v14, s4
+; GFX900-NEXT:    v_perm_b32 v5, v5, v15, s4
+; GFX900-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX900-NEXT:    v_perm_b32 v7, v7, v18, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v16f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v15, v6, v14
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX950-NEXT:    v_perm_b32 v7, v7, v18, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v14, v5, v13
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX950-NEXT:    v_perm_b32 v6, v6, v16, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v13, v4, v12
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX950-NEXT:    v_perm_b32 v5, v5, v15, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v12, v3, v11
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX950-NEXT:    v_perm_b32 v4, v4, v14, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v11, v2, v10
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX950-NEXT:    v_perm_b32 v3, v3, v13, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v10, v1, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX950-NEXT:    v_perm_b32 v2, v2, v12, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_max_f16 v9, v0, v8
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX950-NEXT:    v_perm_b32 v1, v1, v11, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v10, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f16:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 48851cb030233..c1fdfa2c4cf9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -26,24 +27,24 @@ define float @v_maximum_f32(float %src0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32:
 ; GFX10:       ; %bb.0:
@@ -94,12 +95,6 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -144,24 +139,24 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -212,12 +207,6 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -264,26 +253,26 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f32__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -341,26 +330,26 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f32__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX900-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -424,32 +413,32 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_max_f32_e32 v1, s16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_max_f32_e32 v1, s0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_max_f32_e32 v1, s16, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_f32:
 ; GFX10:       ; %bb.0:
@@ -517,31 +506,31 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX940-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX950-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32:
 ; GFX10:       ; %bb.0:
@@ -601,13 +590,6 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -660,31 +642,31 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX940-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX950-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -744,13 +726,6 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -813,40 +788,40 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s19
-; GFX9-NEXT:    v_max_f32_e32 v1, s17, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s17, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s18
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, s16, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:1]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_v2f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s3
-; GFX940-NEXT:    v_max_f32_e32 v1, s1, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_max_f32_e32 v3, s0, v0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:1]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_v2f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s19
+; GFX900-NEXT:    v_max_f32_e32 v1, s17, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s17, v0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s18
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_max_f32_e32 v3, s16, v0
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:1]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s3
+; GFX950-NEXT:    v_max_f32_e32 v1, s1, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
+; GFX950-NEXT:    v_mov_b32_e32 v0, s2
+; GFX950-NEXT:    v_max_f32_e32 v3, s0, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:1]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_v2f32:
 ; GFX10:       ; %bb.0:
@@ -927,38 +902,38 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX940-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX900-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX950-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX950-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32:
 ; GFX10:       ; %bb.0:
@@ -1028,14 +1003,6 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1097,38 +1064,38 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX940-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX900-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX950-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX950-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -1198,14 +1165,6 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1273,45 +1232,45 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX940-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX940-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX950-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX950-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX950-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32:
 ; GFX10:       ; %bb.0:
@@ -1391,15 +1350,6 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX940-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1469,45 +1419,45 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX940-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX940-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX900-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX950-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX950-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX950-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -1587,15 +1537,6 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX940-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX940-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1689,73 +1630,73 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v8f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v16, v0, v8
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v1, v9
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v2, v10
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v3, v11
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v4, v12
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v5, v13
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v6, v14
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX9-NEXT:    v_max_f32_e32 v8, v7, v15
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v8f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v16, v0, v8
-; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX940-NEXT:    v_max_f32_e32 v8, v1, v9
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v2, v10
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v3, v11
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v4, v12
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v5, v13
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v6, v14
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX940-NEXT:    v_max_f32_e32 v8, v7, v15
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v8f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v16, v0, v8
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v1, v9
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v2, v10
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v3, v11
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v4, v12
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v5, v13
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v6, v14
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
+; GFX900-NEXT:    v_max_f32_e32 v8, v7, v15
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v8f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v16, v0, v8
+; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX950-NEXT:    v_max_f32_e32 v8, v1, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v2, v10
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v3, v11
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v4, v12
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v5, v13
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v6, v14
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
+; GFX950-NEXT:    v_max_f32_e32 v8, v7, v15
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f32:
 ; GFX10:       ; %bb.0:
@@ -1968,136 +1909,136 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v16f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX9-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX9-NEXT:    v_max_f32_e32 v18, v13, v29
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX9-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v6, v22
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v7, v23
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v8, v24
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v9, v25
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v10, v26
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v11, v27
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v12, v28
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_max_f32_e32 v19, v14, v30
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v17, v19, s[30:31]
-; GFX9-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v18, v15, v16
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v16f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
-; GFX940-NEXT:    v_max_f32_e32 v33, v0, v16
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
-; GFX940-NEXT:    v_max_f32_e32 v34, v1, v17
-; GFX940-NEXT:    v_max_f32_e32 v35, v2, v18
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX940-NEXT:    v_max_f32_e32 v36, v3, v19
-; GFX940-NEXT:    v_max_f32_e32 v37, v4, v20
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
-; GFX940-NEXT:    v_max_f32_e32 v38, v5, v21
-; GFX940-NEXT:    v_max_f32_e32 v39, v6, v22
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
-; GFX940-NEXT:    v_max_f32_e32 v48, v7, v23
-; GFX940-NEXT:    v_max_f32_e32 v49, v8, v24
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
-; GFX940-NEXT:    v_max_f32_e32 v50, v9, v25
-; GFX940-NEXT:    v_max_f32_e32 v51, v10, v26
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
-; GFX940-NEXT:    v_max_f32_e32 v52, v11, v27
-; GFX940-NEXT:    v_max_f32_e32 v53, v12, v28
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
-; GFX940-NEXT:    v_max_f32_e32 v54, v13, v29
-; GFX940-NEXT:    v_max_f32_e32 v55, v14, v30
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v16, v15, v31
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v16f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[16:17], v0, v16
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX900-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX900-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX900-NEXT:    v_max_f32_e32 v18, v13, v29
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[28:29], v13, v29
+; GFX900-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
+; GFX900-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
+; GFX900-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
+; GFX900-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[12:13], v6, v22
+; GFX900-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[14:15], v7, v23
+; GFX900-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX900-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX900-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX900-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX900-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX900-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX900-NEXT:    v_max_f32_e32 v19, v14, v30
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v17, v18, s[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v17, v0, s[16:17]
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v17, v2, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v17, v4, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v17, v6, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[18:19]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[20:21]
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v17, v10, s[22:23]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[24:25]
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v17, v12, s[26:27]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v18, v15, v16
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v15, v16
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v16f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
+; GFX950-NEXT:    v_max_f32_e32 v33, v0, v16
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX950-NEXT:    v_max_f32_e32 v34, v1, v17
+; GFX950-NEXT:    v_max_f32_e32 v35, v2, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX950-NEXT:    v_max_f32_e32 v36, v3, v19
+; GFX950-NEXT:    v_max_f32_e32 v37, v4, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX950-NEXT:    v_max_f32_e32 v38, v5, v21
+; GFX950-NEXT:    v_max_f32_e32 v39, v6, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX950-NEXT:    v_max_f32_e32 v48, v7, v23
+; GFX950-NEXT:    v_max_f32_e32 v49, v8, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX950-NEXT:    v_max_f32_e32 v50, v9, v25
+; GFX950-NEXT:    v_max_f32_e32 v51, v10, v26
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX950-NEXT:    v_max_f32_e32 v52, v11, v27
+; GFX950-NEXT:    v_max_f32_e32 v53, v12, v28
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX950-NEXT:    v_max_f32_e32 v54, v13, v29
+; GFX950-NEXT:    v_max_f32_e32 v55, v14, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_max_f32_e32 v16, v15, v31
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f32:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 80a0a194713d9..e354ec6fb3dd7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -28,26 +29,26 @@ define double @v_maximum_f64(double %src0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f64:
 ; GFX10:       ; %bb.0:
@@ -100,12 +101,6 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) {
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -152,26 +147,26 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -224,12 +219,6 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -278,28 +267,28 @@ define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f64__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f64__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f64__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f64__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f64__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -362,28 +351,28 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f64__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_f64__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f64__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX900-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f64__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX950-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f64__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -454,35 +443,35 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s18
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_max_f64 v[2:3], s[16:17], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:1]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT:    v_max_f64 v[2:3], s[0:1], v[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:1]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s18
+; GFX900-NEXT:    v_mov_b32_e32 v1, s19
+; GFX900-NEXT:    v_max_f64 v[2:3], s[16:17], v[0:1]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:1]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT:    v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX950-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:1]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_f64:
 ; GFX10:       ; %bb.0:
@@ -555,35 +544,35 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64:
 ; GFX10:       ; %bb.0:
@@ -648,13 +637,6 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -712,35 +694,35 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v2f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT:    v_max_f64 v[4:5], v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -805,13 +787,6 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v2f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -883,46 +858,46 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_maximum_v2f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s22
-; GFX9-NEXT:    v_mov_b32_e32 v4, s20
-; GFX9-NEXT:    v_mov_b32_e32 v1, s23
-; GFX9-NEXT:    v_mov_b32_e32 v5, s21
-; GFX9-NEXT:    v_max_f64 v[2:3], s[18:19], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
-; GFX9-NEXT:    v_max_f64 v[0:1], s[16:17], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:3]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_maximum_v2f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
-; GFX940-NEXT:    v_max_f64 v[2:3], s[2:3], v[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
-; GFX940-NEXT:    v_max_f64 v[4:5], s[0:1], v[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:3]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_maximum_v2f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s22
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_mov_b32_e32 v1, s23
+; GFX900-NEXT:    v_mov_b32_e32 v5, s21
+; GFX900-NEXT:    v_max_f64 v[2:3], s[18:19], v[0:1]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
+; GFX900-NEXT:    v_max_f64 v[0:1], s[16:17], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:3]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_maximum_v2f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT:    v_max_f64 v[2:3], s[2:3], v[0:1]
+; GFX950-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT:    v_max_f64 v[4:5], s[0:1], v[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:3]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_maximum_v2f64:
 ; GFX10:       ; %bb.0:
@@ -1012,44 +987,44 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[8:9], v[4:5], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX940-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[8:9], v[4:5], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
+; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64:
 ; GFX10:       ; %bb.0:
@@ -1125,14 +1100,6 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1201,44 +1168,44 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[8:9], v[4:5], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v3f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX940-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[8:9], v[4:5], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[12:13], v[0:1], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX950-NEXT:    v_max_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
+; GFX950-NEXT:    v_max_f64 v[6:7], v[4:5], v[10:11]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -1314,14 +1281,6 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v3f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1398,53 +1357,53 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[10:11], v[4:5], v[12:13]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[12:13], v[6:7], v[14:15]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX940-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX940-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64:
 ; GFX10:       ; %bb.0:
@@ -1532,15 +1491,6 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX940-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1620,53 +1570,53 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
-; GFX9-NEXT:    v_max_f64 v[10:11], v[4:5], v[12:13]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
-; GFX9-NEXT:    v_max_f64 v[12:13], v[6:7], v[14:15]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v4f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX940-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX940-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX900-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX900-NEXT:    v_max_f64 v[10:11], v[4:5], v[12:13]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX900-NEXT:    v_max_f64 v[12:13], v[6:7], v[14:15]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[16:17], v[0:1], v[8:9]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX950-NEXT:    v_max_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
+; GFX950-NEXT:    v_max_f64 v[8:9], v[6:7], v[14:15]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -1754,15 +1704,6 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_maximum_v4f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX940-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1878,89 +1819,89 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v8f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_max_f64 v[32:33], v[2:3], v[18:19]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
-; GFX9-NEXT:    v_max_f64 v[18:19], v[4:5], v[20:21]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[16:17]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v34, 0x7ff80000
-; GFX9-NEXT:    v_max_f64 v[20:21], v[6:7], v[22:23]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23]
-; GFX9-NEXT:    v_max_f64 v[16:17], v[8:9], v[24:25]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25]
-; GFX9-NEXT:    v_max_f64 v[22:23], v[10:11], v[26:27]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27]
-; GFX9-NEXT:    v_max_f64 v[24:25], v[12:13], v[28:29]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v34, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v32, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v19, v34, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v20, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v21, v34, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, 0, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v34, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v22, 0, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v23, v34, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v24, 0, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v25, v34, s[14:15]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[18:19], v[14:15], v[30:31]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v8f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX940-NEXT:    v_max_f64 v[32:33], v[0:1], v[16:17]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX940-NEXT:    v_max_f64 v[34:35], v[2:3], v[18:19]
-; GFX940-NEXT:    v_max_f64 v[36:37], v[4:5], v[20:21]
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
-; GFX940-NEXT:    v_max_f64 v[38:39], v[6:7], v[22:23]
-; GFX940-NEXT:    v_max_f64 v[48:49], v[8:9], v[24:25]
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX940-NEXT:    v_max_f64 v[50:51], v[10:11], v[26:27]
-; GFX940-NEXT:    v_max_f64 v[52:53], v[12:13], v[28:29]
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v8f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX900-NEXT:    v_max_f64 v[32:33], v[2:3], v[18:19]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX900-NEXT:    v_max_f64 v[18:19], v[4:5], v[20:21]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[0:1], v[16:17]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17]
+; GFX900-NEXT:    v_mov_b32_e32 v34, 0x7ff80000
+; GFX900-NEXT:    v_max_f64 v[20:21], v[6:7], v[22:23]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX900-NEXT:    v_max_f64 v[16:17], v[8:9], v[24:25]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25]
+; GFX900-NEXT:    v_max_f64 v[22:23], v[10:11], v[26:27]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27]
+; GFX900-NEXT:    v_max_f64 v[24:25], v[12:13], v[28:29]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v3, v34, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v32, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v19, v34, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v20, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v21, v34, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v16, 0, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v17, v34, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v22, 0, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v23, v34, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v24, 0, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v25, v34, s[14:15]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_max_f64 v[18:19], v[14:15], v[30:31]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v8f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
+; GFX950-NEXT:    v_max_f64 v[32:33], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    v_max_f64 v[34:35], v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[36:37], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_max_f64 v[38:39], v[6:7], v[22:23]
+; GFX950-NEXT:    v_max_f64 v[48:49], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
+; GFX950-NEXT:    v_max_f64 v[50:51], v[10:11], v[26:27]
+; GFX950-NEXT:    v_max_f64 v[52:53], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f64:
 ; GFX10:       ; %bb.0:
@@ -2332,295 +2273,295 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v16f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v34, s35, 3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[6:7]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[8:9]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[8:9], v[8:9], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[10:11]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[10:11], v[10:11], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[12:13]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[12:13], v[12:13], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, 0, s[14:15]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[14:15], v[14:15], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, 0, s[16:17]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[16:17], v[16:17], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, 0, s[18:19]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[18:19], v[18:19], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, 0, s[20:21]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[20:21], v[20:21], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, 0, s[22:23]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[22:23], v[22:23], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[24:25]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[24:25], v[24:25], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, 0, s[26:27]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[26:27], v[26:27], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
-; GFX9-NEXT:    v_max_f64 v[28:29], v[28:29], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
-; GFX9-NEXT:    v_max_f64 v[30:31], v[30:31], v[32:33]
-; GFX9-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v32, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v32, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v32, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v32, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v32, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v32, s[16:17]
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v17, v32, s[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, v19, v32, s[20:21]
-; GFX9-NEXT:    v_cndmask_b32_e64 v21, v21, v32, s[22:23]
-; GFX9-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
-; GFX9-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
-; GFX9-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX9-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_maximum_v16f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_accvgpr_write_b32 a1, v40 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a2, v41 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a3, v42 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a4, v43 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a5, v44 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a6, v45 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a7, v46 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a8, v47 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a9, v56 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a10, v57 ; Reload Reuse
-; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:20
-; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:32
-; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:28
-; GFX940-NEXT:    scratch_load_dword v57, off, s32 offset:8
-; GFX940-NEXT:    scratch_load_dword v56, off, s32 offset:4
-; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX940-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX940-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX940-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX940-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX940-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX940-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX940-NEXT:    scratch_load_dword v35, off, s32 offset:96
-; GFX940-NEXT:    scratch_load_dword v34, off, s32 offset:92
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    scratch_load_dword v33, off, s32 offset:104
-; GFX940-NEXT:    scratch_load_dword v32, off, s32 offset:100
-; GFX940-NEXT:    v_accvgpr_write_b32 a11, v58 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a12, v59 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a13, v60 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a14, v61 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a15, v62 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a16, v63 ; Reload Reuse
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_max_f64 v[58:59], v[2:3], v[36:37]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
-; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:112
-; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:108
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_max_f64 v[60:61], v[4:5], v[38:39]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
-; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:120
-; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:116
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_max_f64 v[62:63], v[6:7], v[48:49]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
-; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:128
-; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:124
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[0:1], v[56:57]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
-; GFX940-NEXT:    s_waitcnt vmcnt(23)
-; GFX940-NEXT:    v_max_f64 v[56:57], v[8:9], v[46:47]
-; GFX940-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s[4:5]
-; GFX940-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX940-NEXT:    v_cndmask_b32_e64 v1, v3, v0, s[4:5]
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v58, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v59, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX940-NEXT:    s_waitcnt vmcnt(21)
-; GFX940-NEXT:    v_max_f64 v[46:47], v[10:11], v[44:45]
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v60, 0, s[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v57, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX940-NEXT:    s_waitcnt vmcnt(19)
-; GFX940-NEXT:    v_max_f64 v[44:45], v[12:13], v[42:43]
-; GFX940-NEXT:    v_cndmask_b32_e64 v5, v61, v0, s[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e64 v10, v46, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v47, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX940-NEXT:    s_waitcnt vmcnt(17)
-; GFX940-NEXT:    v_max_f64 v[42:43], v[14:15], v[40:41]
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v62, 0, s[2:3]
-; GFX940-NEXT:    v_cndmask_b32_e64 v12, v44, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v45, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX940-NEXT:    s_waitcnt vmcnt(15)
-; GFX940-NEXT:    v_max_f64 v[40:41], v[16:17], v[54:55]
-; GFX940-NEXT:    v_cndmask_b32_e64 v7, v63, v0, s[2:3]
-; GFX940-NEXT:    v_cndmask_b32_e64 v14, v42, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v43, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX940-NEXT:    s_waitcnt vmcnt(13)
-; GFX940-NEXT:    v_max_f64 v[54:55], v[18:19], v[52:53]
-; GFX940-NEXT:    v_accvgpr_read_b32 v63, a16 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v16, v40, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v17, v41, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX940-NEXT:    s_waitcnt vmcnt(11)
-; GFX940-NEXT:    v_max_f64 v[52:53], v[20:21], v[50:51]
-; GFX940-NEXT:    v_accvgpr_read_b32 v62, a15 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v18, v54, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v19, v55, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
-; GFX940-NEXT:    s_waitcnt vmcnt(9)
-; GFX940-NEXT:    v_max_f64 v[50:51], v[22:23], v[34:35]
-; GFX940-NEXT:    v_accvgpr_read_b32 v61, a14 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v20, v52, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v21, v53, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
-; GFX940-NEXT:    s_waitcnt vmcnt(6)
-; GFX940-NEXT:    v_max_f64 v[34:35], v[24:25], v[32:33]
-; GFX940-NEXT:    v_accvgpr_read_b32 v60, a13 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v22, v50, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v23, v51, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[32:33]
-; GFX940-NEXT:    v_accvgpr_read_b32 v59, a12 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v58, a11 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v24, v34, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v25, v35, v0, vcc
-; GFX940-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v56, a9 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v47, a8 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v46, a7 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v45, a6 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v44, a5 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v43, a4 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v42, a3 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v41, a2 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GFX940-NEXT:    s_waitcnt vmcnt(4)
-; GFX940-NEXT:    v_max_f64 v[32:33], v[26:27], v[36:37]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v26, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v27, v33, v0, vcc
-; GFX940-NEXT:    s_waitcnt vmcnt(2)
-; GFX940-NEXT:    v_max_f64 v[32:33], v[28:29], v[38:39]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v28, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v29, v33, v0, vcc
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[32:33], v[30:31], v[48:49]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[48:49]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v30, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v31, v33, v0, vcc
-; GFX940-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v16f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX900-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX900-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX900-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX900-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[0:1], v[0:1], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[2:3], v[2:3], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[4:5], v[4:5], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[6:7], v[6:7], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[8:9]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[8:9], v[8:9], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[10:11]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[10:11], v[10:11], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[12:13]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[12:13], v[12:13], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v12, 0, s[14:15]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[14:15], v[14:15], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v14, 0, s[16:17]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[16:17], v[16:17], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX900-NEXT:    v_cndmask_b32_e64 v16, v16, 0, s[18:19]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[18:19], v[18:19], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX900-NEXT:    v_cndmask_b32_e64 v18, v18, 0, s[20:21]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[20:21], v[20:21], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX900-NEXT:    v_cndmask_b32_e64 v20, v20, 0, s[22:23]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[22:23], v[22:23], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX900-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[24:25]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[24:25], v[24:25], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX900-NEXT:    v_cndmask_b32_e64 v24, v24, 0, s[26:27]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[26:27], v[26:27], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX900-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX900-NEXT:    v_max_f64 v[28:29], v[28:29], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX900-NEXT:    v_max_f64 v[30:31], v[30:31], v[32:33]
+; GFX900-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v3, v32, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v5, v32, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v7, v32, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v9, v32, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v11, v32, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v15, v15, v32, s[16:17]
+; GFX900-NEXT:    v_cndmask_b32_e64 v17, v17, v32, s[18:19]
+; GFX900-NEXT:    v_cndmask_b32_e64 v19, v19, v32, s[20:21]
+; GFX900-NEXT:    v_cndmask_b32_e64 v21, v21, v32, s[22:23]
+; GFX900-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
+; GFX900-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
+; GFX900-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
+; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
+; GFX900-NEXT:    v_readlane_b32 s35, v34, 3
+; GFX900-NEXT:    v_readlane_b32 s34, v34, 2
+; GFX900-NEXT:    v_readlane_b32 s31, v34, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v34, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v16f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v40 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a2, v41 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a3, v42 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a4, v43 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a5, v44 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v45 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a8, v47 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a9, v56 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a10, v57 ; Reload Reuse
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:32
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:28
+; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:100
+; GFX950-NEXT:    v_accvgpr_write_b32 a11, v58 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a12, v59 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a13, v60 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a14, v61 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a15, v62 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a16, v63 ; Reload Reuse
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_max_f64 v[58:59], v[2:3], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:112
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:108
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_max_f64 v[60:61], v[4:5], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:116
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_max_f64 v[62:63], v[6:7], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:124
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_max_f64 v[2:3], v[0:1], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_max_f64 v[56:57], v[8:9], v[46:47]
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s[4:5]
+; GFX950-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v3, v0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v58, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v59, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
+; GFX950-NEXT:    s_waitcnt vmcnt(21)
+; GFX950-NEXT:    v_max_f64 v[46:47], v[10:11], v[44:45]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v60, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
+; GFX950-NEXT:    s_waitcnt vmcnt(19)
+; GFX950-NEXT:    v_max_f64 v[44:45], v[12:13], v[42:43]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v61, v0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v46, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v47, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
+; GFX950-NEXT:    s_waitcnt vmcnt(17)
+; GFX950-NEXT:    v_max_f64 v[42:43], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v62, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v44, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v45, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
+; GFX950-NEXT:    s_waitcnt vmcnt(15)
+; GFX950-NEXT:    v_max_f64 v[40:41], v[16:17], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v63, v0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v42, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v43, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(13)
+; GFX950-NEXT:    v_max_f64 v[54:55], v[18:19], v[52:53]
+; GFX950-NEXT:    v_accvgpr_read_b32 v63, a16 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v40, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v41, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
+; GFX950-NEXT:    s_waitcnt vmcnt(11)
+; GFX950-NEXT:    v_max_f64 v[52:53], v[20:21], v[50:51]
+; GFX950-NEXT:    v_accvgpr_read_b32 v62, a15 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v54, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v55, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
+; GFX950-NEXT:    s_waitcnt vmcnt(9)
+; GFX950-NEXT:    v_max_f64 v[50:51], v[22:23], v[34:35]
+; GFX950-NEXT:    v_accvgpr_read_b32 v61, a14 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v52, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v53, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
+; GFX950-NEXT:    s_waitcnt vmcnt(6)
+; GFX950-NEXT:    v_max_f64 v[34:35], v[24:25], v[32:33]
+; GFX950-NEXT:    v_accvgpr_read_b32 v60, a13 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v22, v50, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v51, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[32:33]
+; GFX950-NEXT:    v_accvgpr_read_b32 v59, a12 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v58, a11 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v34, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v35, v0, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v47, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v46, a7 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a6 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v41, a2 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v40, a1 ; Reload Reuse
+; GFX950-NEXT:    s_waitcnt vmcnt(4)
+; GFX950-NEXT:    v_max_f64 v[32:33], v[26:27], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v33, v0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
+; GFX950-NEXT:    v_max_f64 v[32:33], v[28:29], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v33, v0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_max_f64 v[32:33], v[30:31], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[48:49]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v33, v0, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f64:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index a74043378a259..329a85f91c251 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -2,7 +2,8 @@
 ; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -17,24 +18,24 @@ define half @v_minimum_f16(half %src0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16:
 ; GFX10:       ; %bb.0:
@@ -79,12 +80,6 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
 ; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,24 +115,24 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -182,12 +177,6 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -224,26 +213,26 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f16__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX900-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -291,26 +280,26 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f16__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f16__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX900-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX950-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -362,34 +351,34 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_min_f16_e32 v1, s16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_min_f16_e32 v1, s0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_min_f16_e32 v1, s16, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_f16:
 ; GFX10:       ; %bb.0:
@@ -456,35 +445,35 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16:
 ; GFX10:       ; %bb.0:
@@ -542,12 +531,6 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -590,35 +573,35 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -676,12 +659,6 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -729,50 +706,50 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-NEXT:    s_lshr_b32 s4, s17, 16
-; GFX9-NEXT:    v_pk_min_f16 v1, s16, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
-; GFX9-NEXT:    s_lshr_b32 s5, s16, 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_v2f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX940-NEXT:    v_pk_min_f16 v1, s0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
-; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX940-NEXT:    v_mov_b32_e32 v3, s1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
-; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX940-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_mov_b32_e32 v1, s17
+; GFX900-NEXT:    s_lshr_b32 s4, s17, 16
+; GFX900-NEXT:    v_pk_min_f16 v1, s16, v1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s16, v0
+; GFX900-NEXT:    s_lshr_b32 s5, s16, 16
+; GFX900-NEXT:    v_mov_b32_e32 v3, s4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_v2f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_mov_b32_e32 v1, s1
+; GFX950-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX950-NEXT:    v_pk_min_f16 v1, s0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX950-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX950-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
+; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX950-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_v2f16:
 ; GFX10:       ; %bb.0:
@@ -850,41 +827,41 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16:
 ; GFX10:       ; %bb.0:
@@ -952,13 +929,6 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1007,41 +977,41 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -1109,13 +1079,6 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1171,51 +1134,51 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16:
 ; GFX10:       ; %bb.0:
@@ -1294,13 +1257,6 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f16__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1356,51 +1312,51 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f16__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f16__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f16__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f16__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX950-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16__nsz:
 ; GFX10:       ; %bb.0:
@@ -1479,13 +1435,6 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f16__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1561,83 +1510,83 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v8f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v8, v3, v7
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
-; GFX9-NEXT:    v_pk_min_f16 v7, v2, v6
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
-; GFX9-NEXT:    v_pk_min_f16 v6, v1, v5
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
-; GFX9-NEXT:    v_pk_min_f16 v5, v0, v4
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v7, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v10, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v8f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v8, v3, v7
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v7, v2, v6
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
-; GFX940-NEXT:    v_perm_b32 v3, v3, v10, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v6, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
-; GFX940-NEXT:    v_perm_b32 v2, v2, v8, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v5, v0, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
-; GFX940-NEXT:    v_perm_b32 v1, v1, v7, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v6, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v8f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX900-NEXT:    v_pk_min_f16 v7, v2, v6
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX900-NEXT:    v_pk_min_f16 v6, v1, v5
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX900-NEXT:    v_pk_min_f16 v5, v0, v4
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v7, s4
+; GFX900-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX900-NEXT:    v_perm_b32 v3, v3, v10, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v8f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v7, v2, v6
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX950-NEXT:    v_perm_b32 v3, v3, v10, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v6, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX950-NEXT:    v_perm_b32 v2, v2, v8, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v5, v0, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX950-NEXT:    v_perm_b32 v1, v1, v7, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v6, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f16:
 ; GFX10:       ; %bb.0:
@@ -1818,147 +1767,147 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v16f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v16, v7, v15
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7e00
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
-; GFX9-NEXT:    v_pk_min_f16 v15, v6, v14
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
-; GFX9-NEXT:    v_pk_min_f16 v14, v5, v13
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
-; GFX9-NEXT:    v_pk_min_f16 v13, v4, v12
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
-; GFX9-NEXT:    v_pk_min_f16 v12, v3, v11
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
-; GFX9-NEXT:    v_pk_min_f16 v11, v2, v10
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
-; GFX9-NEXT:    v_pk_min_f16 v10, v1, v9
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
-; GFX9-NEXT:    v_pk_min_f16 v9, v0, v8
-; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v10, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v12, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v13, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v14, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v15, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v18, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v16f16:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_pk_min_f16 v16, v7, v15
-; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7e00
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
-; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v15, v6, v14
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
-; GFX940-NEXT:    v_perm_b32 v7, v7, v18, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v14, v5, v13
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
-; GFX940-NEXT:    v_perm_b32 v6, v6, v16, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v13, v4, v12
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
-; GFX940-NEXT:    v_perm_b32 v5, v5, v15, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v12, v3, v11
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
-; GFX940-NEXT:    v_perm_b32 v4, v4, v14, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v11, v2, v10
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
-; GFX940-NEXT:    v_perm_b32 v3, v3, v13, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v10, v1, v9
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
-; GFX940-NEXT:    v_perm_b32 v2, v2, v12, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    v_pk_min_f16 v9, v0, v8
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
-; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
-; GFX940-NEXT:    v_perm_b32 v1, v1, v11, s0
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
-; GFX940-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
-; GFX940-NEXT:    v_perm_b32 v0, v0, v10, s0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v16f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX900-NEXT:    v_pk_min_f16 v15, v6, v14
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX900-NEXT:    v_pk_min_f16 v14, v5, v13
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX900-NEXT:    v_pk_min_f16 v13, v4, v12
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX900-NEXT:    v_pk_min_f16 v12, v3, v11
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX900-NEXT:    v_pk_min_f16 v11, v2, v10
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX900-NEXT:    v_pk_min_f16 v10, v1, v9
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX900-NEXT:    v_pk_min_f16 v9, v0, v8
+; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX900-NEXT:    v_perm_b32 v0, v0, v10, s4
+; GFX900-NEXT:    v_perm_b32 v1, v1, v11, s4
+; GFX900-NEXT:    v_perm_b32 v2, v2, v12, s4
+; GFX900-NEXT:    v_perm_b32 v3, v3, v13, s4
+; GFX900-NEXT:    v_perm_b32 v4, v4, v14, s4
+; GFX900-NEXT:    v_perm_b32 v5, v5, v15, s4
+; GFX900-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX900-NEXT:    v_perm_b32 v7, v7, v18, s4
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v16f16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX950-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v15, v6, v14
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX950-NEXT:    v_perm_b32 v7, v7, v18, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v14, v5, v13
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX950-NEXT:    v_perm_b32 v6, v6, v16, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v13, v4, v12
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX950-NEXT:    v_perm_b32 v5, v5, v15, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v12, v3, v11
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX950-NEXT:    v_perm_b32 v4, v4, v14, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v11, v2, v10
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX950-NEXT:    v_perm_b32 v3, v3, v13, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v10, v1, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX950-NEXT:    v_perm_b32 v2, v2, v12, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    v_pk_min_f16 v9, v0, v8
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX950-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX950-NEXT:    v_perm_b32 v1, v1, v11, s0
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX950-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX950-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX950-NEXT:    v_perm_b32 v0, v0, v10, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f16:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 2b3041290b586..2614fb3bf9f73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -26,24 +27,24 @@ define float @v_minimum_f32(float %src0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32:
 ; GFX10:       ; %bb.0:
@@ -94,12 +95,6 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -144,24 +139,24 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -212,12 +207,6 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -264,26 +253,26 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f32__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -341,26 +330,26 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f32__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX900-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -424,32 +413,32 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s17
-; GFX9-NEXT:    v_min_f32_e32 v1, s16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v0
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_min_f32_e32 v1, s0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v0
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s17
+; GFX900-NEXT:    v_min_f32_e32 v1, s16, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v0
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_f32:
 ; GFX10:       ; %bb.0:
@@ -517,31 +506,31 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX940-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX950-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32:
 ; GFX10:       ; %bb.0:
@@ -601,13 +590,6 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -660,31 +642,31 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX940-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX950-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -744,13 +726,6 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -813,40 +788,40 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s19
-; GFX9-NEXT:    v_min_f32_e32 v1, s17, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s17, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s18
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, s16, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:1]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_v2f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s3
-; GFX940-NEXT:    v_min_f32_e32 v1, s1, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
-; GFX940-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NEXT:    v_min_f32_e32 v3, s0, v0
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:1]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_v2f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s19
+; GFX900-NEXT:    v_min_f32_e32 v1, s17, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s17, v0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s18
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_min_f32_e32 v3, s16, v0
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:1]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v0, s3
+; GFX950-NEXT:    v_min_f32_e32 v1, s1, v0
+; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
+; GFX950-NEXT:    v_mov_b32_e32 v0, s2
+; GFX950-NEXT:    v_min_f32_e32 v3, s0, v0
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:1]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_v2f32:
 ; GFX10:       ; %bb.0:
@@ -927,38 +902,38 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX940-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX940-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v6, v0, v3
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX900-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-NEXT:    v_min_f32_e32 v3, v2, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v6, v0, v3
+; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX950-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX950-NEXT:    v_min_f32_e32 v3, v2, v5
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32:
 ; GFX10:       ; %bb.0:
@@ -1028,14 +1003,6 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1097,38 +1064,38 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX940-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX940-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v6, v0, v3
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX900-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-NEXT:    v_min_f32_e32 v3, v2, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v6, v0, v3
+; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX950-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX950-NEXT:    v_min_f32_e32 v3, v2, v5
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -1198,14 +1165,6 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1273,45 +1232,45 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX940-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX940-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX940-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v8, v0, v4
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v1, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v2, v6
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v3, v7
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v8, v0, v4
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX950-NEXT:    v_min_f32_e32 v4, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX950-NEXT:    v_min_f32_e32 v4, v2, v6
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX950-NEXT:    v_min_f32_e32 v4, v3, v7
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32:
 ; GFX10:       ; %bb.0:
@@ -1391,15 +1350,6 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f32__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX940-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1469,45 +1419,45 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f32__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f32__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX940-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX940-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX940-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f32__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v8, v0, v4
+; GFX900-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v1, v5
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v2, v6
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX900-NEXT:    v_min_f32_e32 v4, v3, v7
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f32__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v8, v0, v4
+; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX950-NEXT:    v_min_f32_e32 v4, v1, v5
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX950-NEXT:    v_min_f32_e32 v4, v2, v6
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX950-NEXT:    v_min_f32_e32 v4, v3, v7
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nsz:
 ; GFX10:       ; %bb.0:
@@ -1587,15 +1537,6 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX940-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX940-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1689,73 +1630,73 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v8f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v16, v0, v8
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v1, v9
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v2, v10
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v3, v11
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v4, v12
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v5, v13
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v6, v14
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX9-NEXT:    v_min_f32_e32 v8, v7, v15
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v8f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v16, v0, v8
-; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX940-NEXT:    v_min_f32_e32 v8, v1, v9
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v2, v10
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v3, v11
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v4, v12
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v5, v13
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v6, v14
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX940-NEXT:    v_min_f32_e32 v8, v7, v15
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v8f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v16, v0, v8
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX900-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v1, v9
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v2, v10
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v3, v11
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v4, v12
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX900-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v5, v13
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v6, v14
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
+; GFX900-NEXT:    v_min_f32_e32 v8, v7, v15
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX900-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v8f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v16, v0, v8
+; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX950-NEXT:    v_min_f32_e32 v8, v1, v9
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v2, v10
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v3, v11
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v4, v12
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v5, v13
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v6, v14
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
+; GFX950-NEXT:    v_min_f32_e32 v8, v7, v15
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f32:
 ; GFX10:       ; %bb.0:
@@ -1968,136 +1909,136 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v16f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[16:17], v0, v16
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX9-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX9-NEXT:    v_min_f32_e32 v18, v13, v29
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[28:29], v13, v29
-; GFX9-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[12:13], v6, v22
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[14:15], v7, v23
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[18:19], v8, v24
-; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[20:21], v9, v25
-; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[22:23], v10, v26
-; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[24:25], v11, v27
-; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[26:27], v12, v28
-; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_min_f32_e32 v19, v14, v30
-; GFX9-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v18, s[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v17, v0, s[16:17]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v17, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v17, v4, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v17, v6, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[20:21]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v17, v10, s[22:23]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[24:25]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v17, v12, s[26:27]
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v17, v19, s[30:31]
-; GFX9-NEXT:    v_readlane_b32 s31, v31, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v18, v15, v16
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v16f32:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
-; GFX940-NEXT:    v_min_f32_e32 v33, v0, v16
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
-; GFX940-NEXT:    v_min_f32_e32 v34, v1, v17
-; GFX940-NEXT:    v_min_f32_e32 v35, v2, v18
-; GFX940-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX940-NEXT:    v_min_f32_e32 v36, v3, v19
-; GFX940-NEXT:    v_min_f32_e32 v37, v4, v20
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
-; GFX940-NEXT:    v_min_f32_e32 v38, v5, v21
-; GFX940-NEXT:    v_min_f32_e32 v39, v6, v22
-; GFX940-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
-; GFX940-NEXT:    v_min_f32_e32 v48, v7, v23
-; GFX940-NEXT:    v_min_f32_e32 v49, v8, v24
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
-; GFX940-NEXT:    v_min_f32_e32 v50, v9, v25
-; GFX940-NEXT:    v_min_f32_e32 v51, v10, v26
-; GFX940-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
-; GFX940-NEXT:    v_min_f32_e32 v52, v11, v27
-; GFX940-NEXT:    v_min_f32_e32 v53, v12, v28
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
-; GFX940-NEXT:    v_min_f32_e32 v54, v13, v29
-; GFX940-NEXT:    v_min_f32_e32 v55, v14, v30
-; GFX940-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_min_f32_e32 v16, v15, v31
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
-; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v16f32:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[16:17], v0, v16
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX900-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX900-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v18
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX900-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX900-NEXT:    v_min_f32_e32 v18, v13, v29
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[28:29], v13, v29
+; GFX900-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v19
+; GFX900-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v20
+; GFX900-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v21
+; GFX900-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[12:13], v6, v22
+; GFX900-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[14:15], v7, v23
+; GFX900-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[18:19], v8, v24
+; GFX900-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[20:21], v9, v25
+; GFX900-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[22:23], v10, v26
+; GFX900-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[24:25], v11, v27
+; GFX900-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[26:27], v12, v28
+; GFX900-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX900-NEXT:    v_min_f32_e32 v19, v14, v30
+; GFX900-NEXT:    v_cmp_o_f32_e64 s[30:31], v14, v30
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v17, v18, s[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v17, v0, s[16:17]
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v17, v2, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v17, v4, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v17, v6, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[18:19]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[20:21]
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v17, v10, s[22:23]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[24:25]
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v17, v12, s[26:27]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v17, v19, s[30:31]
+; GFX900-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v18, v15, v16
+; GFX900-NEXT:    v_cmp_o_f32_e32 vcc, v15, v16
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v16f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
+; GFX950-NEXT:    v_min_f32_e32 v33, v0, v16
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX950-NEXT:    v_min_f32_e32 v34, v1, v17
+; GFX950-NEXT:    v_min_f32_e32 v35, v2, v18
+; GFX950-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX950-NEXT:    v_min_f32_e32 v36, v3, v19
+; GFX950-NEXT:    v_min_f32_e32 v37, v4, v20
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX950-NEXT:    v_min_f32_e32 v38, v5, v21
+; GFX950-NEXT:    v_min_f32_e32 v39, v6, v22
+; GFX950-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX950-NEXT:    v_min_f32_e32 v48, v7, v23
+; GFX950-NEXT:    v_min_f32_e32 v49, v8, v24
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX950-NEXT:    v_min_f32_e32 v50, v9, v25
+; GFX950-NEXT:    v_min_f32_e32 v51, v10, v26
+; GFX950-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX950-NEXT:    v_min_f32_e32 v52, v11, v27
+; GFX950-NEXT:    v_min_f32_e32 v53, v12, v28
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX950-NEXT:    v_min_f32_e32 v54, v13, v29
+; GFX950-NEXT:    v_min_f32_e32 v55, v14, v30
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_min_f32_e32 v16, v15, v31
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
+; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f32:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 567582c9f58ff..71fdd691a1512 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
@@ -28,26 +29,26 @@ define double @v_minimum_f64(double %src0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f64:
 ; GFX10:       ; %bb.0:
@@ -100,12 +101,6 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) {
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -152,26 +147,26 @@ define double @v_minimum_f64__nsz(double %src0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -224,12 +219,6 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -278,28 +267,28 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f64__nnan_src0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f64__nnan_src0:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f64__nnan_src0:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX900-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f64__nnan_src0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX950-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f64__nnan_src0:
 ; GFX10:       ; %bb.0:
@@ -362,28 +351,28 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f64__nnan_src1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_f64__nnan_src1:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f64__nnan_src1:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX900-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f64__nnan_src1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX950-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f64__nnan_src1:
 ; GFX10:       ; %bb.0:
@@ -454,35 +443,35 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s18
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_min_f64 v[2:3], s[16:17], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:1]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT:    v_min_f64 v[2:3], s[0:1], v[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:1]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s18
+; GFX900-NEXT:    v_mov_b32_e32 v1, s19
+; GFX900-NEXT:    v_min_f64 v[2:3], s[16:17], v[0:1]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:1]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT:    v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX950-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:1]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_f64:
 ; GFX10:       ; %bb.0:
@@ -555,35 +544,35 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64:
 ; GFX10:       ; %bb.0:
@@ -648,13 +637,6 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -712,35 +694,35 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v2f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v4, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT:    v_min_f64 v[4:5], v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[6:7]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -805,13 +787,6 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
 ; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v2f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -883,46 +858,46 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
 ; GFX8-NEXT:    ;;#ASMEND
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_minimum_v2f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s22
-; GFX9-NEXT:    v_mov_b32_e32 v4, s20
-; GFX9-NEXT:    v_mov_b32_e32 v1, s23
-; GFX9-NEXT:    v_mov_b32_e32 v5, s21
-; GFX9-NEXT:    v_min_f64 v[2:3], s[18:19], v[0:1]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
-; GFX9-NEXT:    v_min_f64 v[0:1], s[16:17], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
-; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use v[0:3]
-; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: s_minimum_v2f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
-; GFX940-NEXT:    v_min_f64 v[2:3], s[2:3], v[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
-; GFX940-NEXT:    v_min_f64 v[4:5], s[0:1], v[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX940-NEXT:    ;;#ASMSTART
-; GFX940-NEXT:    ; use v[0:3]
-; GFX940-NEXT:    ;;#ASMEND
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: s_minimum_v2f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, s22
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_mov_b32_e32 v1, s23
+; GFX900-NEXT:    v_mov_b32_e32 v5, s21
+; GFX900-NEXT:    v_min_f64 v[2:3], s[18:19], v[0:1]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
+; GFX900-NEXT:    v_min_f64 v[0:1], s[16:17], v[4:5]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    ; use v[0:3]
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: s_minimum_v2f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT:    v_min_f64 v[2:3], s[2:3], v[0:1]
+; GFX950-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT:    v_min_f64 v[4:5], s[0:1], v[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT:    ;;#ASMSTART
+; GFX950-NEXT:    ; use v[0:3]
+; GFX950-NEXT:    ;;#ASMEND
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_minimum_v2f64:
 ; GFX10:       ; %bb.0:
@@ -1012,44 +987,44 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[8:9], v[4:5], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX940-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[8:9], v[4:5], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
+; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64:
 ; GFX10:       ; %bb.0:
@@ -1125,14 +1100,6 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1201,44 +1168,44 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[8:9], v[4:5], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v3f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
-; GFX940-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[8:9], v[4:5], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v8, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[12:13], v[0:1], v[6:7]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[6:7]
+; GFX950-NEXT:    v_min_f64 v[6:7], v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v12, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v13, v12, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc
+; GFX950-NEXT:    v_min_f64 v[6:7], v[4:5], v[10:11]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -1314,14 +1281,6 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v3f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1398,53 +1357,53 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[10:11], v[4:5], v[12:13]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[12:13], v[6:7], v[14:15]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX940-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX940-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64:
 ; GFX10:       ; %bb.0:
@@ -1532,15 +1491,6 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f64__nnan:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX940-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f64__nnan:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1620,53 +1570,53 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f64__nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
-; GFX9-NEXT:    v_min_f64 v[10:11], v[4:5], v[12:13]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
-; GFX9-NEXT:    v_min_f64 v[12:13], v[6:7], v[14:15]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v4f64__nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX940-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
-; GFX940-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
-; GFX940-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f64__nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX900-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX900-NEXT:    v_min_f64 v[10:11], v[4:5], v[12:13]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX900-NEXT:    v_min_f64 v[12:13], v[6:7], v[14:15]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v17, v7, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v8, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v10, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v12, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[8:9]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f64__nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[16:17], v[0:1], v[8:9]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[8:9]
+; GFX950-NEXT:    v_min_f64 v[8:9], v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v16, 0, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v16, 0x7ff80000
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[10:11]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v16, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[4:5], v[12:13]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[12:13]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v9, v16, vcc
+; GFX950-NEXT:    v_min_f64 v[8:9], v[6:7], v[14:15]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[14:15]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v8, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v9, v16, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f64__nsz:
 ; GFX10:       ; %bb.0:
@@ -1754,15 +1704,6 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
 ; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: v_minimum_v4f64__nnan_nsz:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX940-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1878,89 +1819,89 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v8f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_min_f64 v[32:33], v[2:3], v[18:19]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
-; GFX9-NEXT:    v_min_f64 v[18:19], v[4:5], v[20:21]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[16:17]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v34, 0x7ff80000
-; GFX9-NEXT:    v_min_f64 v[20:21], v[6:7], v[22:23]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23]
-; GFX9-NEXT:    v_min_f64 v[16:17], v[8:9], v[24:25]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25]
-; GFX9-NEXT:    v_min_f64 v[22:23], v[10:11], v[26:27]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27]
-; GFX9-NEXT:    v_min_f64 v[24:25], v[12:13], v[28:29]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v34, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v32, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v19, v34, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v20, 0, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v21, v34, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, 0, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v34, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v22, 0, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v23, v34, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v24, 0, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v25, v34, s[14:15]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_min_f64 v[18:19], v[14:15], v[30:31]
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v8f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
-; GFX940-NEXT:    v_min_f64 v[32:33], v[0:1], v[16:17]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
-; GFX940-NEXT:    v_min_f64 v[34:35], v[2:3], v[18:19]
-; GFX940-NEXT:    v_min_f64 v[36:37], v[4:5], v[20:21]
-; GFX940-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
-; GFX940-NEXT:    v_min_f64 v[38:39], v[6:7], v[22:23]
-; GFX940-NEXT:    v_min_f64 v[48:49], v[8:9], v[24:25]
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
-; GFX940-NEXT:    v_min_f64 v[50:51], v[10:11], v[26:27]
-; GFX940-NEXT:    v_min_f64 v[52:53], v[12:13], v[28:29]
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v8f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX900-NEXT:    v_min_f64 v[32:33], v[2:3], v[18:19]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX900-NEXT:    v_min_f64 v[18:19], v[4:5], v[20:21]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[0:1], v[16:17]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17]
+; GFX900-NEXT:    v_mov_b32_e32 v34, 0x7ff80000
+; GFX900-NEXT:    v_min_f64 v[20:21], v[6:7], v[22:23]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX900-NEXT:    v_min_f64 v[16:17], v[8:9], v[24:25]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25]
+; GFX900-NEXT:    v_min_f64 v[22:23], v[10:11], v[26:27]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27]
+; GFX900-NEXT:    v_min_f64 v[24:25], v[12:13], v[28:29]
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v1, v3, v34, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v32, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v19, v34, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v20, 0, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v21, v34, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v16, 0, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v17, v34, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v22, 0, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v23, v34, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v24, 0, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v25, v34, s[14:15]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_min_f64 v[18:19], v[14:15], v[30:31]
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v18, 0, vcc
+; GFX900-NEXT:    v_cndmask_b32_e32 v15, v19, v34, vcc
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v8f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    v_mov_b32_e32 v54, 0x7ff80000
+; GFX950-NEXT:    v_min_f64 v[32:33], v[0:1], v[16:17]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[16:17]
+; GFX950-NEXT:    v_min_f64 v[34:35], v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[36:37], v[4:5], v[20:21]
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v1, v33, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[18:19]
+; GFX950-NEXT:    v_min_f64 v[38:39], v[6:7], v[22:23]
+; GFX950-NEXT:    v_min_f64 v[48:49], v[8:9], v[24:25]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v34, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v35, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[20:21]
+; GFX950-NEXT:    v_min_f64 v[50:51], v[10:11], v[26:27]
+; GFX950-NEXT:    v_min_f64 v[52:53], v[12:13], v[28:29]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v36, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v5, v37, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[22:23]
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[16:17], v[14:15], v[30:31]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v38, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v7, v39, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[24:25]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v48, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v49, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[26:27]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v50, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v51, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[28:29]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v52, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v53, v54, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[30:31]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v16, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v17, v54, vcc
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f64:
 ; GFX10:       ; %bb.0:
@@ -2332,295 +2273,295 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v16f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_writelane_b32 v34, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v34, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v34, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v34, s35, 3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[4:5], v[4:5], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[6:7]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[6:7], v[6:7], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[8:9]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[8:9], v[8:9], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[10:11]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[10:11], v[10:11], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[12:13]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[12:13], v[12:13], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, 0, s[14:15]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[14:15], v[14:15], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, 0, s[16:17]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[16:17], v[16:17], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, 0, s[18:19]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[18:19], v[18:19], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, 0, s[20:21]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[20:21], v[20:21], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, 0, s[22:23]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[22:23], v[22:23], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[24:25]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[24:25], v[24:25], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, 0, s[26:27]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[26:27], v[26:27], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
-; GFX9-NEXT:    v_min_f64 v[28:29], v[28:29], v[31:32]
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
-; GFX9-NEXT:    v_min_f64 v[30:31], v[30:31], v[32:33]
-; GFX9-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v32, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v32, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v32, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v32, s[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v32, s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v32, s[16:17]
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v17, v32, s[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, v19, v32, s[20:21]
-; GFX9-NEXT:    v_cndmask_b32_e64 v21, v21, v32, s[22:23]
-; GFX9-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
-; GFX9-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
-; GFX9-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
-; GFX9-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
-; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
-; GFX9-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s35, v34, 3
-; GFX9-NEXT:    v_readlane_b32 s34, v34, 2
-; GFX9-NEXT:    v_readlane_b32 s31, v34, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v34, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: v_minimum_v16f64:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_accvgpr_write_b32 a1, v40 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a2, v41 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a3, v42 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a4, v43 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a5, v44 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a6, v45 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a7, v46 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a8, v47 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a9, v56 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a10, v57 ; Reload Reuse
-; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:16
-; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:12
-; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:24
-; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:20
-; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:32
-; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:28
-; GFX940-NEXT:    scratch_load_dword v57, off, s32 offset:8
-; GFX940-NEXT:    scratch_load_dword v56, off, s32 offset:4
-; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:40
-; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:36
-; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:48
-; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:44
-; GFX940-NEXT:    scratch_load_dword v43, off, s32 offset:56
-; GFX940-NEXT:    scratch_load_dword v42, off, s32 offset:52
-; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:64
-; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:60
-; GFX940-NEXT:    scratch_load_dword v55, off, s32 offset:72
-; GFX940-NEXT:    scratch_load_dword v54, off, s32 offset:68
-; GFX940-NEXT:    scratch_load_dword v53, off, s32 offset:80
-; GFX940-NEXT:    scratch_load_dword v52, off, s32 offset:76
-; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:88
-; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:84
-; GFX940-NEXT:    scratch_load_dword v35, off, s32 offset:96
-; GFX940-NEXT:    scratch_load_dword v34, off, s32 offset:92
-; GFX940-NEXT:    scratch_load_dword v31, off, s32
-; GFX940-NEXT:    scratch_load_dword v33, off, s32 offset:104
-; GFX940-NEXT:    scratch_load_dword v32, off, s32 offset:100
-; GFX940-NEXT:    v_accvgpr_write_b32 a11, v58 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a12, v59 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a13, v60 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a14, v61 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a15, v62 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_write_b32 a16, v63 ; Reload Reuse
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_min_f64 v[58:59], v[2:3], v[36:37]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
-; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:112
-; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:108
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_min_f64 v[60:61], v[4:5], v[38:39]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
-; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:120
-; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:116
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_min_f64 v[62:63], v[6:7], v[48:49]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
-; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:128
-; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:124
-; GFX940-NEXT:    s_waitcnt vmcnt(25)
-; GFX940-NEXT:    v_min_f64 v[2:3], v[0:1], v[56:57]
-; GFX940-NEXT:    v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
-; GFX940-NEXT:    s_waitcnt vmcnt(23)
-; GFX940-NEXT:    v_min_f64 v[56:57], v[8:9], v[46:47]
-; GFX940-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s[4:5]
-; GFX940-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX940-NEXT:    v_cndmask_b32_e64 v1, v3, v0, s[4:5]
-; GFX940-NEXT:    v_cndmask_b32_e64 v2, v58, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v3, v59, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
-; GFX940-NEXT:    s_waitcnt vmcnt(21)
-; GFX940-NEXT:    v_min_f64 v[46:47], v[10:11], v[44:45]
-; GFX940-NEXT:    v_cndmask_b32_e64 v4, v60, 0, s[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v9, v57, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
-; GFX940-NEXT:    s_waitcnt vmcnt(19)
-; GFX940-NEXT:    v_min_f64 v[44:45], v[12:13], v[42:43]
-; GFX940-NEXT:    v_cndmask_b32_e64 v5, v61, v0, s[0:1]
-; GFX940-NEXT:    v_cndmask_b32_e64 v10, v46, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v11, v47, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
-; GFX940-NEXT:    s_waitcnt vmcnt(17)
-; GFX940-NEXT:    v_min_f64 v[42:43], v[14:15], v[40:41]
-; GFX940-NEXT:    v_cndmask_b32_e64 v6, v62, 0, s[2:3]
-; GFX940-NEXT:    v_cndmask_b32_e64 v12, v44, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v13, v45, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
-; GFX940-NEXT:    s_waitcnt vmcnt(15)
-; GFX940-NEXT:    v_min_f64 v[40:41], v[16:17], v[54:55]
-; GFX940-NEXT:    v_cndmask_b32_e64 v7, v63, v0, s[2:3]
-; GFX940-NEXT:    v_cndmask_b32_e64 v14, v42, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v15, v43, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
-; GFX940-NEXT:    s_waitcnt vmcnt(13)
-; GFX940-NEXT:    v_min_f64 v[54:55], v[18:19], v[52:53]
-; GFX940-NEXT:    v_accvgpr_read_b32 v63, a16 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v16, v40, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v17, v41, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
-; GFX940-NEXT:    s_waitcnt vmcnt(11)
-; GFX940-NEXT:    v_min_f64 v[52:53], v[20:21], v[50:51]
-; GFX940-NEXT:    v_accvgpr_read_b32 v62, a15 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v18, v54, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v19, v55, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
-; GFX940-NEXT:    s_waitcnt vmcnt(9)
-; GFX940-NEXT:    v_min_f64 v[50:51], v[22:23], v[34:35]
-; GFX940-NEXT:    v_accvgpr_read_b32 v61, a14 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v20, v52, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v21, v53, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
-; GFX940-NEXT:    s_waitcnt vmcnt(6)
-; GFX940-NEXT:    v_min_f64 v[34:35], v[24:25], v[32:33]
-; GFX940-NEXT:    v_accvgpr_read_b32 v60, a13 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v22, v50, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v23, v51, v0, vcc
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[32:33]
-; GFX940-NEXT:    v_accvgpr_read_b32 v59, a12 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v58, a11 ; Reload Reuse
-; GFX940-NEXT:    v_cndmask_b32_e64 v24, v34, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v25, v35, v0, vcc
-; GFX940-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v56, a9 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v47, a8 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v46, a7 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v45, a6 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v44, a5 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v43, a4 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v42, a3 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v41, a2 ; Reload Reuse
-; GFX940-NEXT:    v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GFX940-NEXT:    s_waitcnt vmcnt(4)
-; GFX940-NEXT:    v_min_f64 v[32:33], v[26:27], v[36:37]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v26, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v27, v33, v0, vcc
-; GFX940-NEXT:    s_waitcnt vmcnt(2)
-; GFX940-NEXT:    v_min_f64 v[32:33], v[28:29], v[38:39]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v28, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v29, v33, v0, vcc
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_min_f64 v[32:33], v[30:31], v[48:49]
-; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[48:49]
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_cndmask_b32_e64 v30, v32, 0, vcc
-; GFX940-NEXT:    v_cndmask_b32_e32 v31, v33, v0, vcc
-; GFX940-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v16f64:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX900-NEXT:    v_writelane_b32 v34, s30, 0
+; GFX900-NEXT:    v_writelane_b32 v34, s31, 1
+; GFX900-NEXT:    v_writelane_b32 v34, s34, 2
+; GFX900-NEXT:    v_writelane_b32 v34, s35, 3
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[0:1], v[0:1], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX900-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[2:3], v[2:3], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[4:5], v[4:5], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX900-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[6:7], v[6:7], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX900-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[8:9]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[8:9], v[8:9], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[10:11]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[10:11], v[10:11], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[12:13]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[12:13], v[12:13], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v12, 0, s[14:15]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[14:15], v[14:15], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v14, 0, s[16:17]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[16:17], v[16:17], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX900-NEXT:    v_cndmask_b32_e64 v16, v16, 0, s[18:19]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[18:19], v[18:19], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX900-NEXT:    v_cndmask_b32_e64 v18, v18, 0, s[20:21]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[20:21], v[20:21], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX900-NEXT:    v_cndmask_b32_e64 v20, v20, 0, s[22:23]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[22:23], v[22:23], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX900-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[24:25]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[24:25], v[24:25], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX900-NEXT:    v_cndmask_b32_e64 v24, v24, 0, s[26:27]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[26:27], v[26:27], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX900-NEXT:    v_cndmask_b32_e64 v26, v26, 0, s[28:29]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32]
+; GFX900-NEXT:    v_min_f64 v[28:29], v[28:29], v[31:32]
+; GFX900-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX900-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX900-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX900-NEXT:    v_cndmask_b32_e64 v28, v28, 0, s[30:31]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33]
+; GFX900-NEXT:    v_min_f64 v[30:31], v[30:31], v[32:33]
+; GFX900-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v32, vcc
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v3, v32, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v5, v5, v32, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v7, v32, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v9, v32, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v11, v32, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_e64 v15, v15, v32, s[16:17]
+; GFX900-NEXT:    v_cndmask_b32_e64 v17, v17, v32, s[18:19]
+; GFX900-NEXT:    v_cndmask_b32_e64 v19, v19, v32, s[20:21]
+; GFX900-NEXT:    v_cndmask_b32_e64 v21, v21, v32, s[22:23]
+; GFX900-NEXT:    v_cndmask_b32_e64 v23, v23, v32, s[24:25]
+; GFX900-NEXT:    v_cndmask_b32_e64 v25, v25, v32, s[26:27]
+; GFX900-NEXT:    v_cndmask_b32_e64 v27, v27, v32, s[28:29]
+; GFX900-NEXT:    v_cndmask_b32_e64 v29, v29, v32, s[30:31]
+; GFX900-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[34:35]
+; GFX900-NEXT:    v_cndmask_b32_e64 v30, v30, 0, s[34:35]
+; GFX900-NEXT:    v_readlane_b32 s35, v34, 3
+; GFX900-NEXT:    v_readlane_b32 s34, v34, 2
+; GFX900-NEXT:    v_readlane_b32 s31, v34, 1
+; GFX900-NEXT:    v_readlane_b32 s30, v34, 0
+; GFX900-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX900-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v16f64:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v40 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a2, v41 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a3, v42 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a4, v43 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a5, v44 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a6, v45 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a7, v46 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a8, v47 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a9, v56 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a10, v57 ; Reload Reuse
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:16
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:12
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:24
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:20
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:32
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:28
+; GFX950-NEXT:    scratch_load_dword v57, off, s32 offset:8
+; GFX950-NEXT:    scratch_load_dword v56, off, s32 offset:4
+; GFX950-NEXT:    scratch_load_dword v47, off, s32 offset:40
+; GFX950-NEXT:    scratch_load_dword v46, off, s32 offset:36
+; GFX950-NEXT:    scratch_load_dword v45, off, s32 offset:48
+; GFX950-NEXT:    scratch_load_dword v44, off, s32 offset:44
+; GFX950-NEXT:    scratch_load_dword v43, off, s32 offset:56
+; GFX950-NEXT:    scratch_load_dword v42, off, s32 offset:52
+; GFX950-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX950-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX950-NEXT:    scratch_load_dword v55, off, s32 offset:72
+; GFX950-NEXT:    scratch_load_dword v54, off, s32 offset:68
+; GFX950-NEXT:    scratch_load_dword v53, off, s32 offset:80
+; GFX950-NEXT:    scratch_load_dword v52, off, s32 offset:76
+; GFX950-NEXT:    scratch_load_dword v51, off, s32 offset:88
+; GFX950-NEXT:    scratch_load_dword v50, off, s32 offset:84
+; GFX950-NEXT:    scratch_load_dword v35, off, s32 offset:96
+; GFX950-NEXT:    scratch_load_dword v34, off, s32 offset:92
+; GFX950-NEXT:    scratch_load_dword v31, off, s32
+; GFX950-NEXT:    scratch_load_dword v33, off, s32 offset:104
+; GFX950-NEXT:    scratch_load_dword v32, off, s32 offset:100
+; GFX950-NEXT:    v_accvgpr_write_b32 a11, v58 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a12, v59 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a13, v60 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a14, v61 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a15, v62 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_write_b32 a16, v63 ; Reload Reuse
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_min_f64 v[58:59], v[2:3], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[36:37]
+; GFX950-NEXT:    scratch_load_dword v37, off, s32 offset:112
+; GFX950-NEXT:    scratch_load_dword v36, off, s32 offset:108
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_min_f64 v[60:61], v[4:5], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39]
+; GFX950-NEXT:    scratch_load_dword v39, off, s32 offset:120
+; GFX950-NEXT:    scratch_load_dword v38, off, s32 offset:116
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_min_f64 v[62:63], v[6:7], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49]
+; GFX950-NEXT:    scratch_load_dword v49, off, s32 offset:128
+; GFX950-NEXT:    scratch_load_dword v48, off, s32 offset:124
+; GFX950-NEXT:    s_waitcnt vmcnt(25)
+; GFX950-NEXT:    v_min_f64 v[2:3], v[0:1], v[56:57]
+; GFX950-NEXT:    v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57]
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; GFX950-NEXT:    s_waitcnt vmcnt(23)
+; GFX950-NEXT:    v_min_f64 v[56:57], v[8:9], v[46:47]
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s[4:5]
+; GFX950-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX950-NEXT:    v_cndmask_b32_e64 v1, v3, v0, s[4:5]
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v58, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v3, v59, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[46:47]
+; GFX950-NEXT:    s_waitcnt vmcnt(21)
+; GFX950-NEXT:    v_min_f64 v[46:47], v[10:11], v[44:45]
+; GFX950-NEXT:    v_cndmask_b32_e64 v4, v60, 0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v8, v56, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v9, v57, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[44:45]
+; GFX950-NEXT:    s_waitcnt vmcnt(19)
+; GFX950-NEXT:    v_min_f64 v[44:45], v[12:13], v[42:43]
+; GFX950-NEXT:    v_cndmask_b32_e64 v5, v61, v0, s[0:1]
+; GFX950-NEXT:    v_cndmask_b32_e64 v10, v46, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v11, v47, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[42:43]
+; GFX950-NEXT:    s_waitcnt vmcnt(17)
+; GFX950-NEXT:    v_min_f64 v[42:43], v[14:15], v[40:41]
+; GFX950-NEXT:    v_cndmask_b32_e64 v6, v62, 0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v12, v44, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v13, v45, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[40:41]
+; GFX950-NEXT:    s_waitcnt vmcnt(15)
+; GFX950-NEXT:    v_min_f64 v[40:41], v[16:17], v[54:55]
+; GFX950-NEXT:    v_cndmask_b32_e64 v7, v63, v0, s[2:3]
+; GFX950-NEXT:    v_cndmask_b32_e64 v14, v42, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v15, v43, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[54:55]
+; GFX950-NEXT:    s_waitcnt vmcnt(13)
+; GFX950-NEXT:    v_min_f64 v[54:55], v[18:19], v[52:53]
+; GFX950-NEXT:    v_accvgpr_read_b32 v63, a16 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v16, v40, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v17, v41, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[52:53]
+; GFX950-NEXT:    s_waitcnt vmcnt(11)
+; GFX950-NEXT:    v_min_f64 v[52:53], v[20:21], v[50:51]
+; GFX950-NEXT:    v_accvgpr_read_b32 v62, a15 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v18, v54, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v19, v55, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[50:51]
+; GFX950-NEXT:    s_waitcnt vmcnt(9)
+; GFX950-NEXT:    v_min_f64 v[50:51], v[22:23], v[34:35]
+; GFX950-NEXT:    v_accvgpr_read_b32 v61, a14 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v20, v52, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v21, v53, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[34:35]
+; GFX950-NEXT:    s_waitcnt vmcnt(6)
+; GFX950-NEXT:    v_min_f64 v[34:35], v[24:25], v[32:33]
+; GFX950-NEXT:    v_accvgpr_read_b32 v60, a13 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v22, v50, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v23, v51, v0, vcc
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[32:33]
+; GFX950-NEXT:    v_accvgpr_read_b32 v59, a12 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v58, a11 ; Reload Reuse
+; GFX950-NEXT:    v_cndmask_b32_e64 v24, v34, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v25, v35, v0, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v56, a9 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v47, a8 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v46, a7 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v45, a6 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v44, a5 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v43, a4 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v42, a3 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v41, a2 ; Reload Reuse
+; GFX950-NEXT:    v_accvgpr_read_b32 v40, a1 ; Reload Reuse
+; GFX950-NEXT:    s_waitcnt vmcnt(4)
+; GFX950-NEXT:    v_min_f64 v[32:33], v[26:27], v[36:37]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[36:37]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v26, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v27, v33, v0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(2)
+; GFX950-NEXT:    v_min_f64 v[32:33], v[28:29], v[38:39]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[38:39]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v28, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v29, v33, v0, vcc
+; GFX950-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-NEXT:    v_min_f64 v[32:33], v[30:31], v[48:49]
+; GFX950-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[48:49]
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_cndmask_b32_e64 v30, v32, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e32 v31, v33, v0, vcc
+; GFX950-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f64:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
index 003c3ea7fce10..856cf61127849 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
@@ -25,47 +25,44 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
   ; GFX11-NEXT:   [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3, [[COPY3]], %subreg.sub4, [[COPY2]], %subreg.sub5, [[COPY1]], %subreg.sub6, [[COPY]], %subreg.sub7
-  ; GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; GFX11-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX11-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
-  ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; GFX11-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.1:
   ; GFX11-NEXT:   successors: %bb.2(0x80000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
-  ; GFX11-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+  ; GFX11-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
-  ; GFX11-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+  ; GFX11-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
   ; GFX11-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub4, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub5, implicit $exec
-  ; GFX11-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
-  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec
+  ; GFX11-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
+  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[REG_SEQUENCE]].sub4_sub5, implicit $exec
   ; GFX11-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U64_e64_2]], implicit-def $scc
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub6, implicit $exec
   ; GFX11-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub7, implicit $exec
-  ; GFX11-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
-  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE5]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec
+  ; GFX11-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
+  ; GFX11-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE]].sub6_sub7, implicit $exec
   ; GFX11-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_1]], [[V_CMP_EQ_U64_e64_3]], implicit-def $scc
-  ; GFX11-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3, [[V_READFIRSTLANE_B32_4]], %subreg.sub4, [[V_READFIRSTLANE_B32_5]], %subreg.sub5, [[V_READFIRSTLANE_B32_6]], %subreg.sub6, [[V_READFIRSTLANE_B32_7]], %subreg.sub7
+  ; GFX11-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3, [[V_READFIRSTLANE_B32_4]], %subreg.sub4, [[V_READFIRSTLANE_B32_5]], %subreg.sub5, [[V_READFIRSTLANE_B32_6]], %subreg.sub6, [[V_READFIRSTLANE_B32_7]], %subreg.sub7
   ; GFX11-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.2:
   ; GFX11-NEXT:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   [[IMAGE_LOAD_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 [[REG_SEQUENCE1]], killed [[REG_SEQUENCE6]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; GFX11-NEXT:   [[IMAGE_LOAD_V1_V2_nsa_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_nsa_gfx11 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; GFX11-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
   ; GFX11-NEXT:   SI_WATERFALL_LOOP %bb.1, implicit $exec
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.3:
-  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_1]]
-  ; GFX11-NEXT:   $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]]
+  ; GFX11-NEXT:   $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+  ; GFX11-NEXT:   $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_nsa_gfx11_]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   ;
   ; GFX12-LABEL: name: vimage_move_to_valu
diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
index fa62048fd31ad..bb248fe0444db 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
@@ -67,16 +67,24 @@ body:             |
     liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
 
     ; GCN-LABEL: name: preserve_all_lanes_wwm_above_args
-    ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
+    ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
+    ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
+    ; GCN-NEXT: $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
     ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
-    ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
-    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 10, implicit $exec
-    ; GCN-NEXT: $vgpr8 = COPY killed $vgpr0
+    ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
+    ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 10, implicit $exec
+    ; GCN-NEXT: $vgpr8 = COPY killed $vgpr10
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+    ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
     ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
     $sgpr35 = S_MOV_B32 5
     $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
@@ -104,10 +112,12 @@ body:             |
     ; GCN-LABEL: name: dont_preserve_args
     ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9
     ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
     ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
     renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
@@ -131,15 +141,23 @@ body:             |
     liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
 
     ; GCN-LABEL: name: preserve_inactive_lanes_wwm_args
-    ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr10
+    ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
     ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
     ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
     ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
     ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-    ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr0
+    ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
+    ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
     $sgpr35 = S_MOV_B32 5
     $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
@@ -168,6 +186,7 @@ body:             |
     ; GCN-LABEL: name: dont_preserve_if_no_chain_calls
     ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9
     ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
     ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
     ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
@@ -175,6 +194,7 @@ body:             |
     ; GCN-NEXT: $vgpr9 = V_MOV_B32_e32 20, implicit $exec
     ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 30, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
     $sgpr35 = S_MOV_B32 5
     $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
index 49001a2cfd7a6..4aea915936ffc 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
@@ -8,7 +8,6 @@
   declare amdgpu_gfx void @gfx_callee()
 
   define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
-  define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void}
   define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
   define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
   define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
@@ -36,55 +35,23 @@ body:             |
     liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
 
     ; GCN-LABEL: name: preserve_inactive_wwm
-    ; GCN: liveins: $sgpr0, $sgpr35
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-    ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-    ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1
-    renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
-    renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-    SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
-
-...
-
-# Check that it also works for SGPR to VGPR spills.
-
----
-name:            preserve_inactive_detected_wwm
-tracksRegLiveness: true
-frameInfo:
-  hasTailCall:     true
-machineFunctionInfo:
-  stackPtrOffsetReg: '$sgpr32'
-  returnsVoid:     true
-body:             |
-  bb.0:
-    liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
-
-    ; GCN-LABEL: name: preserve_inactive_detected_wwm
     ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
-    ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
-    ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
-    ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
-    ; GCN-NEXT: $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
-    ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
-    ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
-    ; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
+    ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
+    ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
     ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
-    renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
-    $sgpr35 = S_MOV_B32 5
-    $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
-    renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
-    renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
-    $sgpr35 = S_MOV_B32 5
-    $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
-    renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
     renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
     renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
 
 ...
@@ -110,11 +77,13 @@ body:             |
     ; GCN-NEXT: $sgpr35 = S_MOV_B32 5
     ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
     ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
+    ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     ; GCN-NEXT: S_ENDPGM 0
     renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
     $sgpr35 = S_MOV_B32 5
     $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
     renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
+    S_NOP 0, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7
     S_ENDPGM 0
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index b87439a9d6fae..c611c4b502817 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -357,7 +357,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
   ; CHECK-NEXT:   undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
   ; CHECK-NEXT:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 2d3b34e9bddec..3a49c9b23f59e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
   ; SI-NEXT: bb.5:
   ; SI-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %29:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_nsa_gfx10 undef %29:vgpr_32, undef %31:vgpr_32, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
   ; SI-NEXT:   SI_WATERFALL_LOOP %bb.4, implicit $exec
   ; SI-NEXT: {{  $}}
@@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]]
-  ; SI-NEXT:   GLOBAL_STORE_DWORD undef %32:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_DWORD undef %34:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_nsa_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; SI-NEXT:   S_ENDPGM 0
 entry:
   %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index ab84c0c905771..d18a2288ef244 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2205,8 +2205,8 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB39_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
-; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_3
 ; GFX10-W32-NEXT:    s_branch .LBB39_4
diff --git a/llvm/test/CodeGen/ARM/fcmp-xo.ll b/llvm/test/CodeGen/ARM/fcmp-xo.ll
index 3d5972f065859..908dbd7a11a6b 100644
--- a/llvm/test/CodeGen/ARM/fcmp-xo.ll
+++ b/llvm/test/CodeGen/ARM/fcmp-xo.ll
@@ -54,12 +54,12 @@ define arm_aapcs_vfpcc float @float128(float %a0) local_unnamed_addr {
 ; NEON-LABEL: float128:
 ; NEON:       @ %bb.0:
 ; NEON-NEXT:    mov.w r0, #1124073472
-; NEON-NEXT:    vmov.f32 s2, #5.000000e-01
-; NEON-NEXT:    vmov d3, r0, r0
-; NEON-NEXT:    vmov.f32 s4, #-5.000000e-01
-; NEON-NEXT:    vcmp.f32 s6, s0
+; NEON-NEXT:    vmov.f32 s4, #5.000000e-01
+; NEON-NEXT:    vmov d1, r0, r0
+; NEON-NEXT:    vmov.f32 s6, #-5.000000e-01
+; NEON-NEXT:    vcmp.f32 s2, s0
 ; NEON-NEXT:    vmrs APSR_nzcv, fpscr
-; NEON-NEXT:    vselgt.f32 s0, s4, s2
+; NEON-NEXT:    vselgt.f32 s0, s6, s4
 ; NEON-NEXT:    bx lr
   %1 = fcmp nsz olt float %a0, 128.000000e+00
   %2 = select i1 %1, float -5.000000e-01, float 5.000000e-01
diff --git a/llvm/test/CodeGen/ARM/fp16-instructions.ll b/llvm/test/CodeGen/ARM/fp16-instructions.ll
index 1988cb1d2f903..7a1d5ddfa301b 100644
--- a/llvm/test/CodeGen/ARM/fp16-instructions.ll
+++ b/llvm/test/CodeGen/ARM/fp16-instructions.ll
@@ -700,9 +700,9 @@ define half @select_cc1(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc1:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vseleq.f16 s0,
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vseleq.f16 s0,
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -728,9 +728,9 @@ define half @select_cc_ge1(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_ge1:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselge.f16 s0,
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselge.f16 s0,
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -751,9 +751,9 @@ define half @select_cc_ge2(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_ge2:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselge.f16 s0,
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselge.f16 s0,
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -774,9 +774,9 @@ define half @select_cc_ge3(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_ge3:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselge.f16 s0,
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselge.f16 s0,
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -797,9 +797,9 @@ define half @select_cc_ge4(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_ge4:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselge.f16 s0, s{{.}}, s{{.}}
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselge.f16 s0, s{{.}}, s{{.}}
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -821,9 +821,9 @@ define half @select_cc_gt1(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_gt1:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselgt.f16  s0, s{{.}}, s{{.}}
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselgt.f16  s0, s{{.}}, s{{.}}
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -844,9 +844,9 @@ define half @select_cc_gt2(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_gt2:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs  APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselgt.f16  s0, s{{.}}, s{{.}}
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs  APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselgt.f16  s0, s{{.}}, s{{.}}
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -867,9 +867,9 @@ define half @select_cc_gt3(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_gt3:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs  APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselgt.f16  s0, s{{.}}, s{{.}}
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs  APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselgt.f16  s0, s{{.}}, s{{.}}
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -890,9 +890,9 @@ define half @select_cc_gt4(ptr %a0)  {
 
 ; CHECK-LABEL:                 select_cc_gt4:
 
-; CHECK-HARDFP-FULLFP16:       vcmp.f16
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs  APSR_nzcv, fpscr
-; CHECK-HARDFP-FULLFP16-NEXT:  vselgt.f16  s0, s{{.}}, s{{.}}
+; CHECK-HARDFP-FULLFP16:  vcmp.f16
+; CHECK-HARDFP-FULLFP16:  vmrs  APSR_nzcv, fpscr
+; CHECK-HARDFP-FULLFP16:  vselgt.f16  s0, s{{.}}, s{{.}}
 
 ; CHECK-SOFTFP-FP16-A32:       vcmp.f32
 ; CHECK-SOFTFP-FP16-A32-NEXT:  vmrs APSR_nzcv, fpscr
@@ -923,10 +923,10 @@ entry:
 ; CHECK-LABEL:                 select_cc4:
 
 ; CHECK-HARDFP-FULLFP16:       vldr.16	[[S2:s[0-9]]], .LCPI{{.*}}
+; CHECK-HARDFP-FULLFP16:       vcmp.f16	s0, [[S2]]
 ; CHECK-HARDFP-FULLFP16:       vldr.16	[[S4:s[0-9]]], .LCPI{{.*}}
+; CHECK-HARDFP-FULLFP16:       vmrs	APSR_nzcv, fpscr
 ; CHECK-HARDFP-FULLFP16:       vmov.f16 [[S6:s[0-9]]], #-2.000000e+00
-; CHECK-HARDFP-FULLFP16:       vcmp.f16	s0, [[S2]]
-; CHECK-HARDFP-FULLFP16-NEXT:  vmrs	APSR_nzcv, fpscr
 ; CHECK-HARDFP-FULLFP16-NEXT:  vseleq.f16	[[S0:s[0-9]]], [[S6]], [[S4]]
 ; CHECK-HARDFP-FULLFP16-NEXT:  vselvs.f16	s0, [[S6]], [[S0]]
 
diff --git a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
index 56e734c440433..996b46c51ab36 100644
--- a/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vminmaxnm-safe.ll
@@ -5,11 +5,11 @@
 define half @fp16_vminnm_o(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_o:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r0
-; CHECK-NEXT:    vcmp.f16 s0, s2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vselgt.f16 s0, s2, s0
+; CHECK-NEXT:    vselgt.f16 s0, s0, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -37,11 +37,11 @@ entry:
 define half @fp16_vminnm_u(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_u:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r0
-; CHECK-NEXT:    vmov.f16 s2, r1
-; CHECK-NEXT:    vcmp.f16 s0, s2
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
+; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vselge.f16 s0, s2, s0
+; CHECK-NEXT:    vselge.f16 s0, s0, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -53,11 +53,11 @@ entry:
 define half @fp16_vminnm_ule(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_ule:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r0
-; CHECK-NEXT:    vmov.f16 s2, r1
-; CHECK-NEXT:    vcmp.f16 s0, s2
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vmov.f16 s2, r0
+; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vselgt.f16 s0, s2, s0
+; CHECK-NEXT:    vselgt.f16 s0, s0, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -69,11 +69,11 @@ entry:
 define half @fp16_vminnm_u_rev(half %a, half %b) {
 ; CHECK-LABEL: fp16_vminnm_u_rev:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f16 s0, r1
-; CHECK-NEXT:    vmov.f16 s2, r0
-; CHECK-NEXT:    vcmp.f16 s0, s2
+; CHECK-NEXT:    vmov.f16 s0, r0
+; CHECK-NEXT:    vmov.f16 s2, r1
+; CHECK-NEXT:    vcmp.f16 s2, s0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vselge.f16 s0, s2, s0
+; CHECK-NEXT:    vselge.f16 s0, s0, s2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
index 4b27e804e6df9..84f6ee276ba5f 100644
--- a/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll
@@ -258,11 +258,11 @@ define i13 @test_signed_i13_f32(float %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI2_0
+; VFP2-NEXT:    vldr s6, .LCPI2_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI2_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #61440
@@ -358,11 +358,11 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI3_0
+; VFP2-NEXT:    vldr s6, .LCPI3_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI3_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #32768
@@ -458,11 +458,11 @@ define i19 @test_signed_i19_f32(float %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI4_0
+; VFP2-NEXT:    vldr s6, .LCPI4_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI4_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movlt r0, #0
@@ -639,39 +639,31 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
 ;
 ; VFP-LABEL: test_signed_i50_f32:
 ; VFP:       @ %bb.0:
-; VFP-NEXT:    .save {r7, lr}
-; VFP-NEXT:    push {r7, lr}
-; VFP-NEXT:    .vsave {d8}
-; VFP-NEXT:    vpush {d8}
-; VFP-NEXT:    vmov s16, r0
+; VFP-NEXT:    .save {r4, lr}
+; VFP-NEXT:    push {r4, lr}
+; VFP-NEXT:    mov r4, r0
 ; VFP-NEXT:    bl __aeabi_f2lz
 ; VFP-NEXT:    vldr s0, .LCPI6_0
-; VFP-NEXT:    vldr s2, .LCPI6_1
-; VFP-NEXT:    vcmp.f32 s16, s0
+; VFP-NEXT:    vmov s2, r4
+; VFP-NEXT:    vldr s4, .LCPI6_1
+; VFP-NEXT:    vcmp.f32 s2, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s2
-; VFP-NEXT:    itt lt
+; VFP-NEXT:    ittt lt
 ; VFP-NEXT:    movlt r1, #0
 ; VFP-NEXT:    movtlt r1, #65534
+; VFP-NEXT:    movlt r0, #0
+; VFP-NEXT:    vcmp.f32 s2, s4
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    itt gt
+; VFP-NEXT:    ittt gt
 ; VFP-NEXT:    movwgt r1, #65535
 ; VFP-NEXT:    movtgt r1, #1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r0, #0
-; VFP-NEXT:    vcmp.f32 s16, s2
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vcmp.f32 s16, s16
+; VFP-NEXT:    vcmp.f32 s2, s2
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP-NEXT:    itt vs
 ; VFP-NEXT:    movvs r0, #0
 ; VFP-NEXT:    movvs r1, #0
-; VFP-NEXT:    vpop {d8}
-; VFP-NEXT:    pop {r7, pc}
+; VFP-NEXT:    pop {r4, pc}
 ; VFP-NEXT:    .p2align 2
 ; VFP-NEXT:  @ %bb.1:
 ; VFP-NEXT:  .LCPI6_0:
@@ -765,27 +757,18 @@ define i64 @test_signed_i64_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s4, .LCPI7_1
 ; VFP-NEXT:    vcmp.f32 s2, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itt lt
 ; VFP-NEXT:    movlt r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt.w r1, #-2147483648
 ; VFP-NEXT:    vcmp.f32 s2, s4
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itt gt
 ; VFP-NEXT:    mvngt r1, #-2147483648
+; VFP-NEXT:    movgt.w r0, #-1
 ; VFP-NEXT:    vcmp.f32 s2, s2
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it vs
+; VFP-NEXT:    itt vs
+; VFP-NEXT:    movvs r0, #0
 ; VFP-NEXT:    movvs r1, #0
 ; VFP-NEXT:    pop {r4, pc}
 ; VFP-NEXT:    .p2align 2
@@ -923,51 +906,24 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s4, .LCPI8_1
 ; VFP-NEXT:    vcmp.f32 s2, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itttt lt
 ; VFP-NEXT:    movlt r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r1, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r2, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    mvnlt r3, #7
 ; VFP-NEXT:    vcmp.f32 s2, s4
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itttt gt
 ; VFP-NEXT:    movgt r3, #7
+; VFP-NEXT:    movgt.w r2, #-1
+; VFP-NEXT:    movgt.w r1, #-1
+; VFP-NEXT:    movgt.w r0, #-1
 ; VFP-NEXT:    vcmp.f32 s2, s2
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it vs
+; VFP-NEXT:    itttt vs
+; VFP-NEXT:    movvs r0, #0
+; VFP-NEXT:    movvs r1, #0
+; VFP-NEXT:    movvs r2, #0
 ; VFP-NEXT:    movvs r3, #0
 ; VFP-NEXT:    pop {r4, pc}
 ; VFP-NEXT:    .p2align 2
@@ -1108,51 +1064,24 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s4, .LCPI9_1
 ; VFP-NEXT:    vcmp.f32 s2, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itttt lt
 ; VFP-NEXT:    movlt r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r0, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r1, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s4
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s2
-; VFP-NEXT:    it gt
-; VFP-NEXT:    movgt.w r2, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s2, s0
-; VFP-NEXT:    it vs
-; VFP-NEXT:    movvs r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
 ; VFP-NEXT:    movlt.w r3, #-2147483648
 ; VFP-NEXT:    vcmp.f32 s2, s4
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itttt gt
 ; VFP-NEXT:    mvngt r3, #-2147483648
+; VFP-NEXT:    movgt.w r2, #-1
+; VFP-NEXT:    movgt.w r1, #-1
+; VFP-NEXT:    movgt.w r0, #-1
 ; VFP-NEXT:    vcmp.f32 s2, s2
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it vs
+; VFP-NEXT:    itttt vs
+; VFP-NEXT:    movvs r0, #0
+; VFP-NEXT:    movvs r1, #0
+; VFP-NEXT:    movvs r2, #0
 ; VFP-NEXT:    movvs r3, #0
 ; VFP-NEXT:    pop {r4, pc}
 ; VFP-NEXT:    .p2align 2
@@ -1451,15 +1380,15 @@ define i13 @test_signed_i13_f64(double %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov d16, r0, r1
 ; VFP2-NEXT:    vldr d17, .LCPI12_0
+; VFP2-NEXT:    vldr d18, .LCPI12_1
 ; VFP2-NEXT:    vcvt.s32.f64 s0, d16
 ; VFP2-NEXT:    vcmp.f64 d16, d17
-; VFP2-NEXT:    vldr d17, .LCPI12_1
-; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
+; VFP2-NEXT:    vmov r0, s0
+; VFP2-NEXT:    vcmp.f64 d16, d18
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #61440
 ; VFP2-NEXT:    movtlt r0, #65535
-; VFP2-NEXT:    vcmp.f64 d16, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movwgt r0, #4095
@@ -1568,15 +1497,15 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov d16, r0, r1
 ; VFP2-NEXT:    vldr d17, .LCPI13_0
+; VFP2-NEXT:    vldr d18, .LCPI13_1
 ; VFP2-NEXT:    vcvt.s32.f64 s0, d16
 ; VFP2-NEXT:    vcmp.f64 d16, d17
-; VFP2-NEXT:    vldr d17, .LCPI13_1
-; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
+; VFP2-NEXT:    vmov r0, s0
+; VFP2-NEXT:    vcmp.f64 d16, d18
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #32768
 ; VFP2-NEXT:    movtlt r0, #65535
-; VFP2-NEXT:    vcmp.f64 d16, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movwgt r0, #32767
@@ -1685,15 +1614,15 @@ define i19 @test_signed_i19_f64(double %f) nounwind {
 ; VFP2:       @ %bb.0:
 ; VFP2-NEXT:    vmov d16, r0, r1
 ; VFP2-NEXT:    vldr d17, .LCPI14_0
+; VFP2-NEXT:    vldr d18, .LCPI14_1
 ; VFP2-NEXT:    vcvt.s32.f64 s0, d16
 ; VFP2-NEXT:    vcmp.f64 d16, d17
-; VFP2-NEXT:    vldr d17, .LCPI14_1
-; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
+; VFP2-NEXT:    vmov r0, s0
+; VFP2-NEXT:    vcmp.f64 d16, d18
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    movtlt r0, #65532
-; VFP2-NEXT:    vcmp.f64 d16, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    itt gt
 ; VFP2-NEXT:    movwgt r0, #65535
@@ -1905,42 +1834,32 @@ define i50 @test_signed_i50_f64(double %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i50_f64:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
-; VFP2-NEXT:    vmov d8, r0, r1
+; VFP2-NEXT:    .save {r4, r5, r7, lr}
+; VFP2-NEXT:    push {r4, r5, r7, lr}
+; VFP2-NEXT:    mov r4, r1
+; VFP2-NEXT:    mov r5, r0
 ; VFP2-NEXT:    bl __aeabi_d2lz
 ; VFP2-NEXT:    vldr d16, .LCPI16_0
-; VFP2-NEXT:    vldr d17, .LCPI16_1
-; VFP2-NEXT:    vcmp.f64 d8, d16
+; VFP2-NEXT:    vmov d17, r5, r4
+; VFP2-NEXT:    vldr d18, .LCPI16_1
+; VFP2-NEXT:    vcmp.f64 d17, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    itt lt
+; VFP2-NEXT:    ittt lt
 ; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movtlt r1, #65534
-; VFP2-NEXT:    vcmp.f64 d8, d17
+; VFP2-NEXT:    movlt r0, #0
+; VFP2-NEXT:    vcmp.f64 d17, d18
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    itt gt
+; VFP2-NEXT:    ittt gt
 ; VFP2-NEXT:    movwgt r1, #65535
 ; VFP2-NEXT:    movtgt r1, #1
-; VFP2-NEXT:    vcmp.f64 d8, d8
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f64 d8, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d8, d8
+; VFP2-NEXT:    vcmp.f64 d17, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itt vs
 ; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    movvs r1, #0
+; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ; VFP2-NEXT:    .p2align 3
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI16_0:
@@ -2074,27 +1993,18 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; VFP2-NEXT:    vldr d18, .LCPI17_1
 ; VFP2-NEXT:    vcmp.f64 d17, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt.w r1, #-2147483648
 ; VFP2-NEXT:    vcmp.f64 d17, d18
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itt gt
 ; VFP2-NEXT:    mvngt r1, #-2147483648
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f64 d17, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itt vs
+; VFP2-NEXT:    movvs r0, #0
 ; VFP2-NEXT:    movvs r1, #0
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ; VFP2-NEXT:    .p2align 3
@@ -2118,27 +2028,18 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; FP16-NEXT:    vldr d2, .LCPI17_1
 ; FP16-NEXT:    vcmp.f64 d1, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d1, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vcmp.f64 d1, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt.w r1, #-2147483648
 ; FP16-NEXT:    vcmp.f64 d1, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itt gt
 ; FP16-NEXT:    mvngt r1, #-2147483648
+; FP16-NEXT:    movgt.w r0, #-1
 ; FP16-NEXT:    vcmp.f64 d1, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itt vs
+; FP16-NEXT:    movvs r0, #0
 ; FP16-NEXT:    movvs r1, #0
 ; FP16-NEXT:    pop {r4, r5, r7, pc}
 ; FP16-NEXT:    .p2align 3
@@ -2287,51 +2188,24 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; VFP2-NEXT:    vldr d18, .LCPI18_1
 ; VFP2-NEXT:    vcmp.f64 d17, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r2, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    mvnlt r3, #7
 ; VFP2-NEXT:    vcmp.f64 d17, d18
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt r3, #7
+; VFP2-NEXT:    movgt.w r2, #-1
+; VFP2-NEXT:    movgt.w r1, #-1
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f64 d17, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itttt vs
+; VFP2-NEXT:    movvs r0, #0
+; VFP2-NEXT:    movvs r1, #0
+; VFP2-NEXT:    movvs r2, #0
 ; VFP2-NEXT:    movvs r3, #0
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ; VFP2-NEXT:    .p2align 3
@@ -2350,56 +2224,29 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; FP16-NEXT:    mov r4, r1
 ; FP16-NEXT:    mov r5, r0
 ; FP16-NEXT:    bl __fixdfti
-; FP16-NEXT:    vldr d2, .LCPI18_0
-; FP16-NEXT:    vmov d0, r5, r4
-; FP16-NEXT:    vldr d1, .LCPI18_1
-; FP16-NEXT:    vcmp.f64 d0, d2
+; FP16-NEXT:    vldr d0, .LCPI18_0
+; FP16-NEXT:    vmov d1, r5, r4
+; FP16-NEXT:    vldr d2, .LCPI18_1
+; FP16-NEXT:    vcmp.f64 d1, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    mvnlt r3, #7
-; FP16-NEXT:    vcmp.f64 d0, d1
+; FP16-NEXT:    vcmp.f64 d1, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt r3, #7
-; FP16-NEXT:    vcmp.f64 d0, d0
+; FP16-NEXT:    movgt.w r2, #-1
+; FP16-NEXT:    movgt.w r1, #-1
+; FP16-NEXT:    movgt.w r0, #-1
+; FP16-NEXT:    vcmp.f64 d1, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itttt vs
+; FP16-NEXT:    movvs r0, #0
+; FP16-NEXT:    movvs r1, #0
+; FP16-NEXT:    movvs r2, #0
 ; FP16-NEXT:    movvs r3, #0
 ; FP16-NEXT:    pop {r4, r5, r7, pc}
 ; FP16-NEXT:    .p2align 3
@@ -2550,51 +2397,24 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; VFP2-NEXT:    vldr d18, .LCPI19_1
 ; VFP2-NEXT:    vcmp.f64 d17, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r1, #0
+; VFP2-NEXT:    movlt r2, #0
+; VFP2-NEXT:    movlt.w r3, #-2147483648
 ; VFP2-NEXT:    vcmp.f64 d17, d18
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
+; VFP2-NEXT:    mvngt r3, #-2147483648
+; VFP2-NEXT:    movgt.w r2, #-1
 ; VFP2-NEXT:    movgt.w r1, #-1
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f64 d17, d17
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itttt vs
+; VFP2-NEXT:    movvs r0, #0
 ; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
 ; VFP2-NEXT:    movvs r2, #0
-; VFP2-NEXT:    vcmp.f64 d17, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt.w r3, #-2147483648
-; VFP2-NEXT:    vcmp.f64 d17, d18
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    mvngt r3, #-2147483648
-; VFP2-NEXT:    vcmp.f64 d17, d17
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
 ; VFP2-NEXT:    movvs r3, #0
 ; VFP2-NEXT:    pop {r4, r5, r7, pc}
 ; VFP2-NEXT:    .p2align 3
@@ -2613,56 +2433,29 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; FP16-NEXT:    mov r4, r1
 ; FP16-NEXT:    mov r5, r0
 ; FP16-NEXT:    bl __fixdfti
-; FP16-NEXT:    vldr d2, .LCPI19_0
-; FP16-NEXT:    vmov d0, r5, r4
-; FP16-NEXT:    vldr d1, .LCPI19_1
-; FP16-NEXT:    vcmp.f64 d0, d2
+; FP16-NEXT:    vldr d0, .LCPI19_0
+; FP16-NEXT:    vmov d1, r5, r4
+; FP16-NEXT:    vldr d2, .LCPI19_1
+; FP16-NEXT:    vcmp.f64 d1, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d0, d1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d0, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vcmp.f64 d0, d2
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt.w r3, #-2147483648
-; FP16-NEXT:    vcmp.f64 d0, d1
+; FP16-NEXT:    vcmp.f64 d1, d2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    mvngt r3, #-2147483648
-; FP16-NEXT:    vcmp.f64 d0, d0
+; FP16-NEXT:    movgt.w r2, #-1
+; FP16-NEXT:    movgt.w r1, #-1
+; FP16-NEXT:    movgt.w r0, #-1
+; FP16-NEXT:    vcmp.f64 d1, d1
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itttt vs
+; FP16-NEXT:    movvs r0, #0
+; FP16-NEXT:    movvs r1, #0
+; FP16-NEXT:    movvs r2, #0
 ; FP16-NEXT:    movvs r3, #0
 ; FP16-NEXT:    pop {r4, r5, r7, pc}
 ; FP16-NEXT:    .p2align 3
@@ -2949,11 +2742,11 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI22_0
+; VFP2-NEXT:    vldr s6, .LCPI22_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI22_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #61440
@@ -3055,11 +2848,11 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI23_0
+; VFP2-NEXT:    vldr s6, .LCPI23_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI23_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movwlt r0, #32768
@@ -3161,11 +2954,11 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    vmov s0, r0
 ; VFP2-NEXT:    vldr s2, .LCPI24_0
+; VFP2-NEXT:    vldr s6, .LCPI24_1
 ; VFP2-NEXT:    vcvt.s32.f32 s4, s0
 ; VFP2-NEXT:    vcmp.f32 s0, s2
-; VFP2-NEXT:    vldr s2, .LCPI24_1
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s0, s2
+; VFP2-NEXT:    vcmp.f32 s0, s6
 ; VFP2-NEXT:    vmov r0, s4
 ; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movlt r0, #0
@@ -3357,40 +3150,32 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i50_f16:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
+; VFP2-NEXT:    .save {r4, lr}
+; VFP2-NEXT:    push {r4, lr}
 ; VFP2-NEXT:    bl __aeabi_h2f
-; VFP2-NEXT:    vmov s16, r0
+; VFP2-NEXT:    mov r4, r0
 ; VFP2-NEXT:    bl __aeabi_f2lz
 ; VFP2-NEXT:    vldr s0, .LCPI26_0
-; VFP2-NEXT:    vldr s2, .LCPI26_1
-; VFP2-NEXT:    vcmp.f32 s16, s0
+; VFP2-NEXT:    vmov s2, r4
+; VFP2-NEXT:    vldr s4, .LCPI26_1
+; VFP2-NEXT:    vcmp.f32 s2, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s2
-; VFP2-NEXT:    itt lt
+; VFP2-NEXT:    ittt lt
 ; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movtlt r1, #65534
+; VFP2-NEXT:    movlt r0, #0
+; VFP2-NEXT:    vcmp.f32 s2, s4
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    itt gt
+; VFP2-NEXT:    ittt gt
 ; VFP2-NEXT:    movwgt r1, #65535
 ; VFP2-NEXT:    movtgt r1, #1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f32 s16, s2
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f32 s16, s16
+; VFP2-NEXT:    vcmp.f32 s2, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    itt vs
 ; VFP2-NEXT:    movvs r0, #0
 ; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI26_0:
@@ -3412,21 +3197,15 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s2, .LCPI26_1
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    itt lt
+; FP16-NEXT:    ittt lt
 ; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movtlt r1, #65534
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    itt gt
-; FP16-NEXT:    movwgt r1, #65535
-; FP16-NEXT:    movtgt r1, #1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f32 s16, s2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    ittt gt
+; FP16-NEXT:    movwgt r1, #65535
+; FP16-NEXT:    movtgt r1, #1
 ; FP16-NEXT:    movgt.w r0, #-1
 ; FP16-NEXT:    vcmp.f32 s16, s16
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
@@ -3531,27 +3310,18 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; VFP2-NEXT:    vldr s4, .LCPI27_1
 ; VFP2-NEXT:    vcmp.f32 s2, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt.w r1, #-2147483648
 ; VFP2-NEXT:    vcmp.f32 s2, s4
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itt gt
 ; VFP2-NEXT:    mvngt r1, #-2147483648
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f32 s2, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itt vs
+; VFP2-NEXT:    movvs r0, #0
 ; VFP2-NEXT:    movvs r1, #0
 ; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
@@ -3575,27 +3345,18 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s2, .LCPI27_1
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt.w r1, #-2147483648
 ; FP16-NEXT:    vcmp.f32 s16, s2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itt gt
 ; FP16-NEXT:    mvngt r1, #-2147483648
+; FP16-NEXT:    movgt.w r0, #-1
 ; FP16-NEXT:    vcmp.f32 s16, s16
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itt vs
+; FP16-NEXT:    movvs r0, #0
 ; FP16-NEXT:    movvs r1, #0
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -3737,51 +3498,24 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; VFP2-NEXT:    vldr s4, .LCPI28_1
 ; VFP2-NEXT:    vcmp.f32 s2, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    mvnlt r3, #7
 ; VFP2-NEXT:    vcmp.f32 s2, s4
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt r3, #7
+; VFP2-NEXT:    movgt.w r2, #-1
+; VFP2-NEXT:    movgt.w r1, #-1
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f32 s2, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itttt vs
+; VFP2-NEXT:    movvs r0, #0
+; VFP2-NEXT:    movvs r1, #0
+; VFP2-NEXT:    movvs r2, #0
 ; VFP2-NEXT:    movvs r3, #0
 ; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
@@ -3805,51 +3539,24 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s2, .LCPI28_1
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    mvnlt r3, #7
 ; FP16-NEXT:    vcmp.f32 s16, s2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt r3, #7
+; FP16-NEXT:    movgt.w r2, #-1
+; FP16-NEXT:    movgt.w r1, #-1
+; FP16-NEXT:    movgt.w r0, #-1
 ; FP16-NEXT:    vcmp.f32 s16, s16
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itttt vs
+; FP16-NEXT:    movvs r0, #0
+; FP16-NEXT:    movvs r1, #0
+; FP16-NEXT:    movvs r2, #0
 ; FP16-NEXT:    movvs r3, #0
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -3994,51 +3701,24 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; VFP2-NEXT:    vldr s4, .LCPI29_1
 ; VFP2-NEXT:    vcmp.f32 s2, s0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r0, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s4
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s2
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s2, s0
-; VFP2-NEXT:    it vs
-; VFP2-NEXT:    movvs r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
 ; VFP2-NEXT:    movlt.w r3, #-2147483648
 ; VFP2-NEXT:    vcmp.f32 s2, s4
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    mvngt r3, #-2147483648
+; VFP2-NEXT:    movgt.w r2, #-1
+; VFP2-NEXT:    movgt.w r1, #-1
+; VFP2-NEXT:    movgt.w r0, #-1
 ; VFP2-NEXT:    vcmp.f32 s2, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it vs
+; VFP2-NEXT:    itttt vs
+; VFP2-NEXT:    movvs r0, #0
+; VFP2-NEXT:    movvs r1, #0
+; VFP2-NEXT:    movvs r2, #0
 ; VFP2-NEXT:    movvs r3, #0
 ; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
@@ -4062,51 +3742,24 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s2, .LCPI29_1
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
 ; FP16-NEXT:    movlt r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r0, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s2
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s16
-; FP16-NEXT:    it gt
-; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it vs
-; FP16-NEXT:    movvs r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
 ; FP16-NEXT:    movlt.w r3, #-2147483648
 ; FP16-NEXT:    vcmp.f32 s16, s2
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    mvngt r3, #-2147483648
+; FP16-NEXT:    movgt.w r2, #-1
+; FP16-NEXT:    movgt.w r1, #-1
+; FP16-NEXT:    movgt.w r0, #-1
 ; FP16-NEXT:    vcmp.f32 s16, s16
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it vs
+; FP16-NEXT:    itttt vs
+; FP16-NEXT:    movvs r0, #0
+; FP16-NEXT:    movvs r1, #0
+; FP16-NEXT:    movvs r2, #0
 ; FP16-NEXT:    movvs r3, #0
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
diff --git a/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll b/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll
index 3438fb113015c..14eb67104edda 100644
--- a/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/ARM/fptoui-sat-scalar.ll
@@ -503,8 +503,8 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
 ; VFP-NEXT:    vcmp.f32 s16, #0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP-NEXT:    itt lt
-; VFP-NEXT:    movlt r0, #0
 ; VFP-NEXT:    movlt r1, #0
+; VFP-NEXT:    movlt r0, #0
 ; VFP-NEXT:    vcmp.f32 s16, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP-NEXT:    ittt gt
@@ -586,19 +586,13 @@ define i64 @test_signed_i64_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s0, .LCPI7_0
 ; VFP-NEXT:    vcmp.f32 s16, #0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itt lt
+; VFP-NEXT:    movlt r1, #0
 ; VFP-NEXT:    movlt r0, #0
 ; VFP-NEXT:    vcmp.f32 s16, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itt gt
 ; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r1, #0
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r1, #-1
 ; VFP-NEXT:    vpop {d8}
 ; VFP-NEXT:    pop {r7, pc}
@@ -701,35 +695,17 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s0, .LCPI8_0
 ; VFP-NEXT:    vcmp.f32 s16, #0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itttt lt
+; VFP-NEXT:    movlt r3, #0
+; VFP-NEXT:    movlt r2, #0
+; VFP-NEXT:    movlt r1, #0
 ; VFP-NEXT:    movlt r0, #0
 ; VFP-NEXT:    vcmp.f32 s16, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itttt gt
 ; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r1, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r2, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r3, #0
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt r3, #15
 ; VFP-NEXT:    vpop {d8}
 ; VFP-NEXT:    pop {r7, pc}
@@ -831,35 +807,17 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
 ; VFP-NEXT:    vldr s0, .LCPI9_0
 ; VFP-NEXT:    vcmp.f32 s16, #0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
+; VFP-NEXT:    itttt lt
+; VFP-NEXT:    movlt r3, #0
+; VFP-NEXT:    movlt r2, #0
+; VFP-NEXT:    movlt r1, #0
 ; VFP-NEXT:    movlt r0, #0
 ; VFP-NEXT:    vcmp.f32 s16, s0
 ; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
+; VFP-NEXT:    itttt gt
 ; VFP-NEXT:    movgt.w r0, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r1, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r1, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r2, #0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    vcmp.f32 s16, #0
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r2, #-1
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it lt
-; VFP-NEXT:    movlt r3, #0
-; VFP-NEXT:    vcmp.f32 s16, s0
-; VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP-NEXT:    it gt
 ; VFP-NEXT:    movgt.w r3, #-1
 ; VFP-NEXT:    vpop {d8}
 ; VFP-NEXT:    pop {r7, pc}
@@ -1453,8 +1411,8 @@ define i50 @test_signed_i50_f64(double %f) nounwind {
 ; VFP2-NEXT:    vldr d16, .LCPI16_0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    itt lt
-; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    movlt r1, #0
+; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    vcmp.f64 d8, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    ittt gt
@@ -1564,19 +1522,13 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; VFP2-NEXT:    vcmp.f64 d8, #0
 ; VFP2-NEXT:    vldr d16, .LCPI17_0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itt lt
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    vcmp.f64 d8, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
 ; VFP2-NEXT:    vpop {d8}
 ; VFP2-NEXT:    pop {r7, pc}
@@ -1597,19 +1549,13 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; FP16-NEXT:    vcmp.f64 d8, #0
 ; FP16-NEXT:    vldr d0, .LCPI17_0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itt lt
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f64 d8, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -1719,35 +1665,17 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; VFP2-NEXT:    vcmp.f64 d8, #0
 ; VFP2-NEXT:    vldr d16, .LCPI18_0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
+; VFP2-NEXT:    movlt r3, #0
+; VFP2-NEXT:    movlt r2, #0
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    vcmp.f64 d8, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r3, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt r3, #15
 ; VFP2-NEXT:    vpop {d8}
 ; VFP2-NEXT:    pop {r7, pc}
@@ -1768,35 +1696,17 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; FP16-NEXT:    vcmp.f64 d8, #0
 ; FP16-NEXT:    vldr d0, .LCPI18_0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
+; FP16-NEXT:    movlt r3, #0
+; FP16-NEXT:    movlt r2, #0
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f64 d8, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r3, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt r3, #15
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -1902,35 +1812,17 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; VFP2-NEXT:    vcmp.f64 d8, #0
 ; VFP2-NEXT:    vldr d16, .LCPI19_0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
+; VFP2-NEXT:    movlt r3, #0
+; VFP2-NEXT:    movlt r2, #0
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    vcmp.f64 d8, d16
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vcmp.f64 d8, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r3, #0
-; VFP2-NEXT:    vcmp.f64 d8, d16
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r3, #-1
 ; VFP2-NEXT:    vpop {d8}
 ; VFP2-NEXT:    pop {r7, pc}
@@ -1951,35 +1843,17 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; FP16-NEXT:    vcmp.f64 d8, #0
 ; FP16-NEXT:    vldr d0, .LCPI19_0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
+; FP16-NEXT:    movlt r3, #0
+; FP16-NEXT:    movlt r2, #0
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f64 d8, d0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vcmp.f64 d8, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r3, #0
-; FP16-NEXT:    vcmp.f64 d8, d0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r3, #-1
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -2526,27 +2400,25 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i50_f16:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
+; VFP2-NEXT:    .save {r4, lr}
+; VFP2-NEXT:    push {r4, lr}
 ; VFP2-NEXT:    bl __aeabi_h2f
-; VFP2-NEXT:    vmov s16, r0
+; VFP2-NEXT:    mov r4, r0
 ; VFP2-NEXT:    bl __aeabi_f2ulz
-; VFP2-NEXT:    vldr s0, .LCPI26_0
-; VFP2-NEXT:    vcmp.f32 s16, #0
+; VFP2-NEXT:    vmov s0, r4
+; VFP2-NEXT:    vldr s2, .LCPI26_0
+; VFP2-NEXT:    vcmp.f32 s0, #0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    itt lt
-; VFP2-NEXT:    movlt r0, #0
 ; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
+; VFP2-NEXT:    movlt r0, #0
+; VFP2-NEXT:    vcmp.f32 s0, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
 ; VFP2-NEXT:    ittt gt
 ; VFP2-NEXT:    movwgt r1, #65535
 ; VFP2-NEXT:    movtgt r1, #3
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI26_0:
@@ -2566,8 +2438,8 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; FP16-NEXT:    vcmp.f32 s16, #0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    itt lt
-; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    movlt r1, #0
+; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
 ; FP16-NEXT:    ittt gt
@@ -2642,32 +2514,24 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i64_f16:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
+; VFP2-NEXT:    .save {r4, lr}
+; VFP2-NEXT:    push {r4, lr}
 ; VFP2-NEXT:    bl __aeabi_h2f
-; VFP2-NEXT:    vmov s16, r0
+; VFP2-NEXT:    mov r4, r0
 ; VFP2-NEXT:    bl __aeabi_f2ulz
-; VFP2-NEXT:    vldr s0, .LCPI27_0
-; VFP2-NEXT:    vcmp.f32 s16, #0
+; VFP2-NEXT:    vmov s0, r4
+; VFP2-NEXT:    vldr s2, .LCPI27_0
+; VFP2-NEXT:    vcmp.f32 s0, #0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itt lt
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
+; VFP2-NEXT:    vcmp.f32 s0, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI27_0:
@@ -2686,19 +2550,13 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s0, .LCPI27_0
 ; FP16-NEXT:    vcmp.f32 s16, #0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itt lt
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -2794,48 +2652,28 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i100_f16:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
+; VFP2-NEXT:    .save {r4, lr}
+; VFP2-NEXT:    push {r4, lr}
 ; VFP2-NEXT:    bl __aeabi_h2f
-; VFP2-NEXT:    vmov s16, r0
+; VFP2-NEXT:    mov r4, r0
 ; VFP2-NEXT:    bl __fixunssfti
-; VFP2-NEXT:    vldr s0, .LCPI28_0
-; VFP2-NEXT:    vcmp.f32 s16, #0
+; VFP2-NEXT:    vmov s0, r4
+; VFP2-NEXT:    vldr s2, .LCPI28_0
+; VFP2-NEXT:    vcmp.f32 s0, #0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
+; VFP2-NEXT:    movlt r3, #0
+; VFP2-NEXT:    movlt r2, #0
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
+; VFP2-NEXT:    vcmp.f32 s0, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r3, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt r3, #15
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI28_0:
@@ -2854,35 +2692,17 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s0, .LCPI28_0
 ; FP16-NEXT:    vcmp.f32 s16, #0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
+; FP16-NEXT:    movlt r3, #0
+; FP16-NEXT:    movlt r2, #0
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r3, #0
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt r3, #15
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
@@ -2977,48 +2797,28 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ;
 ; VFP2-LABEL: test_signed_i128_f16:
 ; VFP2:       @ %bb.0:
-; VFP2-NEXT:    .save {r7, lr}
-; VFP2-NEXT:    push {r7, lr}
-; VFP2-NEXT:    .vsave {d8}
-; VFP2-NEXT:    vpush {d8}
+; VFP2-NEXT:    .save {r4, lr}
+; VFP2-NEXT:    push {r4, lr}
 ; VFP2-NEXT:    bl __aeabi_h2f
-; VFP2-NEXT:    vmov s16, r0
+; VFP2-NEXT:    mov r4, r0
 ; VFP2-NEXT:    bl __fixunssfti
-; VFP2-NEXT:    vldr s0, .LCPI29_0
-; VFP2-NEXT:    vcmp.f32 s16, #0
+; VFP2-NEXT:    vmov s0, r4
+; VFP2-NEXT:    vldr s2, .LCPI29_0
+; VFP2-NEXT:    vcmp.f32 s0, #0
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
+; VFP2-NEXT:    itttt lt
+; VFP2-NEXT:    movlt r3, #0
+; VFP2-NEXT:    movlt r2, #0
+; VFP2-NEXT:    movlt r1, #0
 ; VFP2-NEXT:    movlt r0, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
+; VFP2-NEXT:    vcmp.f32 s0, s2
 ; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
+; VFP2-NEXT:    itttt gt
 ; VFP2-NEXT:    movgt.w r0, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r1, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r2, #0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    vcmp.f32 s16, #0
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r2, #-1
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r3, #0
-; VFP2-NEXT:    vcmp.f32 s16, s0
-; VFP2-NEXT:    vmrs APSR_nzcv, fpscr
-; VFP2-NEXT:    it gt
 ; VFP2-NEXT:    movgt.w r3, #-1
-; VFP2-NEXT:    vpop {d8}
-; VFP2-NEXT:    pop {r7, pc}
+; VFP2-NEXT:    pop {r4, pc}
 ; VFP2-NEXT:    .p2align 2
 ; VFP2-NEXT:  @ %bb.1:
 ; VFP2-NEXT:  .LCPI29_0:
@@ -3037,35 +2837,17 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; FP16-NEXT:    vldr s0, .LCPI29_0
 ; FP16-NEXT:    vcmp.f32 s16, #0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
+; FP16-NEXT:    itttt lt
+; FP16-NEXT:    movlt r3, #0
+; FP16-NEXT:    movlt r2, #0
+; FP16-NEXT:    movlt r1, #0
 ; FP16-NEXT:    movlt r0, #0
 ; FP16-NEXT:    vcmp.f32 s16, s0
 ; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
+; FP16-NEXT:    itttt gt
 ; FP16-NEXT:    movgt.w r0, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r1, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r1, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r2, #0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    vcmp.f32 s16, #0
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r2, #-1
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it lt
-; FP16-NEXT:    movlt r3, #0
-; FP16-NEXT:    vcmp.f32 s16, s0
-; FP16-NEXT:    vmrs APSR_nzcv, fpscr
-; FP16-NEXT:    it gt
 ; FP16-NEXT:    movgt.w r3, #-1
 ; FP16-NEXT:    vpop {d8}
 ; FP16-NEXT:    pop {r7, pc}
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index feb790821e875..4003af5d44be8 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -16,12 +16,12 @@ declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
 define float @fminnum32_intrinsic(float %x, float %y) {
 ; ARMV7-LABEL: fminnum32_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov s0, r0
-; ARMV7-NEXT:    vmov s2, r1
-; ARMV7-NEXT:    vcmp.f32 s0, s2
+; ARMV7-NEXT:    vmov s0, r1
+; ARMV7-NEXT:    vmov s2, r0
+; ARMV7-NEXT:    vcmp.f32 s2, s0
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovlt.f32 s2, s0
-; ARMV7-NEXT:    vmov r0, s2
+; ARMV7-NEXT:    vmovlt.f32 s0, s2
+; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
 ; ARMV8-LABEL: fminnum32_intrinsic:
@@ -102,12 +102,12 @@ define float @fminnum32_non_zero_intrinsic(float %x) {
 define float @fmaxnum32_intrinsic(float %x, float %y) {
 ; ARMV7-LABEL: fmaxnum32_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov s0, r0
-; ARMV7-NEXT:    vmov s2, r1
-; ARMV7-NEXT:    vcmp.f32 s0, s2
+; ARMV7-NEXT:    vmov s0, r1
+; ARMV7-NEXT:    vmov s2, r0
+; ARMV7-NEXT:    vcmp.f32 s2, s0
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovgt.f32 s2, s0
-; ARMV7-NEXT:    vmov r0, s2
+; ARMV7-NEXT:    vmovgt.f32 s0, s2
+; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
 ; ARMV8-LABEL: fmaxnum32_intrinsic:
@@ -160,12 +160,12 @@ define float @fmaxnum32_nsz_intrinsic(float %x, float %y) {
 define float @fmaxnum32_zero_intrinsic(float %x) {
 ; ARMV7-LABEL: fmaxnum32_zero_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov s2, r0
-; ARMV7-NEXT:    vldr s0, .LCPI5_0
-; ARMV7-NEXT:    vcmp.f32 s2, #0
+; ARMV7-NEXT:    vmov s0, r0
+; ARMV7-NEXT:    vldr s2, .LCPI5_0
+; ARMV7-NEXT:    vcmp.f32 s0, #0
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovgt.f32 s0, s2
-; ARMV7-NEXT:    vmov r0, s0
+; ARMV7-NEXT:    vmovgt.f32 s2, s0
+; ARMV7-NEXT:    vmov r0, s2
 ; ARMV7-NEXT:    bx lr
 ; ARMV7-NEXT:    .p2align 2
 ; ARMV7-NEXT:  @ %bb.1:
@@ -425,12 +425,12 @@ define double@fmaxnum64_nsz_intrinsic(double %x, double %y) {
 define double @fmaxnum64_zero_intrinsic(double %x) {
 ; ARMV7-LABEL: fmaxnum64_zero_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov d17, r0, r1
-; ARMV7-NEXT:    vcmp.f64 d17, #0
+; ARMV7-NEXT:    vmov d16, r0, r1
+; ARMV7-NEXT:    vcmp.f64 d16, #0
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmov.i32 d16, #0x0
-; ARMV7-NEXT:    vmovgt.f64 d16, d17
-; ARMV7-NEXT:    vmov r0, r1, d16
+; ARMV7-NEXT:    vmov.i32 d17, #0x0
+; ARMV7-NEXT:    vmovgt.f64 d17, d16
+; ARMV7-NEXT:    vmov r0, r1, d17
 ; ARMV7-NEXT:    bx lr
 ;
 ; ARMV8-LABEL: fmaxnum64_zero_intrinsic:
@@ -1065,18 +1065,18 @@ define <2 x double> @fminnumv264_one_zero_intrinsic(<2 x double> %x) {
 ;
 ; ARMV8M-LABEL: fminnumv264_one_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
-; ARMV8M-NEXT:    vmov d3, r2, r3
-; ARMV8M-NEXT:    vldr d1, .LCPI27_0
-; ARMV8M-NEXT:    vcmp.f64 d3, #0
+; ARMV8M-NEXT:    vmov d1, r2, r3
+; ARMV8M-NEXT:    vldr d0, .LCPI27_0
+; ARMV8M-NEXT:    vcmp.f64 d1, #0
 ; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
 ; ARMV8M-NEXT:    vmov d2, r0, r1
-; ARMV8M-NEXT:    vmov.f64 d0, #-1.000000e+00
-; ARMV8M-NEXT:    vcmp.f64 d0, d2
-; ARMV8M-NEXT:    vmovlt.f64 d1, d3
+; ARMV8M-NEXT:    vmov.f64 d3, #-1.000000e+00
+; ARMV8M-NEXT:    vcmp.f64 d3, d2
+; ARMV8M-NEXT:    vmovlt.f64 d0, d1
 ; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r2, r3, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
-; ARMV8M-NEXT:    vmov r0, r1, d0
+; ARMV8M-NEXT:    vmov r2, r3, d0
+; ARMV8M-NEXT:    vselgt.f64 d1, d2, d3
+; ARMV8M-NEXT:    vmov r0, r1, d1
 ; ARMV8M-NEXT:    bx lr
 ; ARMV8M-NEXT:    .p2align 3
 ; ARMV8M-NEXT:  @ %bb.1:
@@ -1186,18 +1186,18 @@ define <2 x double> @fmaxnumv264_nsz_intrinsic(<2 x double> %x, <2 x double> %y)
 define <2 x double> @fmaxnumv264_zero_intrinsic(<2 x double> %x) {
 ; ARMV7-LABEL: fmaxnumv264_zero_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vldr d17, .LCPI30_0
-; ARMV7-NEXT:    vmov d18, r2, r3
-; ARMV7-NEXT:    vmov d19, r0, r1
-; ARMV7-NEXT:    vcmp.f64 d18, d17
+; ARMV7-NEXT:    vmov d18, r0, r1
+; ARMV7-NEXT:    vldr d16, .LCPI30_0
+; ARMV7-NEXT:    vcmp.f64 d18, #0
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmov.i32 d16, #0x0
-; ARMV7-NEXT:    vcmp.f64 d19, #0
+; ARMV7-NEXT:    vmov d19, r2, r3
+; ARMV7-NEXT:    vcmp.f64 d19, d16
+; ARMV7-NEXT:    vmov.i32 d17, #0x0
 ; ARMV7-NEXT:    vmovgt.f64 d17, d18
 ; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmov r2, r3, d17
+; ARMV7-NEXT:    vmov r0, r1, d17
 ; ARMV7-NEXT:    vmovgt.f64 d16, d19
-; ARMV7-NEXT:    vmov r0, r1, d16
+; ARMV7-NEXT:    vmov r2, r3, d16
 ; ARMV7-NEXT:    bx lr
 ; ARMV7-NEXT:    .p2align 3
 ; ARMV7-NEXT:  @ %bb.1:
@@ -1225,26 +1225,26 @@ define <2 x double> @fmaxnumv264_zero_intrinsic(<2 x double> %x) {
 ; ARMV8M-LABEL: fmaxnumv264_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    vmov d2, r0, r1
-; ARMV8M-NEXT:    vldr d0, .LCPI30_0
+; ARMV8M-NEXT:    vldr d1, .LCPI30_1
 ; ARMV8M-NEXT:    vcmp.f64 d2, #0
 ; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
 ; ARMV8M-NEXT:    vmov d3, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d3, d0
-; ARMV8M-NEXT:    vldr d1, .LCPI30_1
-; ARMV8M-NEXT:    vselgt.f64 d1, d2, d1
+; ARMV8M-NEXT:    vcmp.f64 d3, d1
+; ARMV8M-NEXT:    vldr d0, .LCPI30_0
+; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
 ; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d3, d0
-; ARMV8M-NEXT:    vmov r2, r3, d0
+; ARMV8M-NEXT:    vmov r0, r1, d0
+; ARMV8M-NEXT:    vselgt.f64 d1, d3, d1
+; ARMV8M-NEXT:    vmov r2, r3, d1
 ; ARMV8M-NEXT:    bx lr
 ; ARMV8M-NEXT:    .p2align 3
 ; ARMV8M-NEXT:  @ %bb.1:
 ; ARMV8M-NEXT:  .LCPI30_0:
-; ARMV8M-NEXT:    .long 0 @ double -0
-; ARMV8M-NEXT:    .long 2147483648
-; ARMV8M-NEXT:  .LCPI30_1:
 ; ARMV8M-NEXT:    .long 0 @ double 0
 ; ARMV8M-NEXT:    .long 0
+; ARMV8M-NEXT:  .LCPI30_1:
+; ARMV8M-NEXT:    .long 0 @ double -0
+; ARMV8M-NEXT:    .long 2147483648
   %a = call nnan <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double><double 0.0, double -0.0>)
   ret <2 x double> %a
 }
diff --git a/llvm/test/CodeGen/ARM/select.ll b/llvm/test/CodeGen/ARM/select.ll
index 24ca9aeac7f2d..496a6c0f5acbb 100644
--- a/llvm/test/CodeGen/ARM/select.ll
+++ b/llvm/test/CodeGen/ARM/select.ll
@@ -164,13 +164,13 @@ define double @f7(double %a, double %b) {
 ; CHECK-VFP-LABEL: f7:
 ; CHECK-VFP:       @ %bb.0:
 ; CHECK-VFP-NEXT:    vldr d17, .LCPI6_0
-; CHECK-VFP-NEXT:    vmov d19, r0, r1
+; CHECK-VFP-NEXT:    vmov d18, r0, r1
 ; CHECK-VFP-NEXT:    vmov.f64 d16, #-1.000000e+00
-; CHECK-VFP-NEXT:    vcmp.f64 d19, d17
+; CHECK-VFP-NEXT:    vcmp.f64 d18, d17
 ; CHECK-VFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-VFP-NEXT:    vmov d18, r2, r3
-; CHECK-VFP-NEXT:    vmovmi.f64 d18, d16
-; CHECK-VFP-NEXT:    vmov r0, r1, d18
+; CHECK-VFP-NEXT:    vmov d19, r2, r3
+; CHECK-VFP-NEXT:    vmovmi.f64 d19, d16
+; CHECK-VFP-NEXT:    vmov r0, r1, d19
 ; CHECK-VFP-NEXT:    bx lr
 ; CHECK-VFP-NEXT:    .p2align 3
 ; CHECK-VFP-NEXT:  @ %bb.1:
@@ -181,14 +181,14 @@ define double @f7(double %a, double %b) {
 ; CHECK-NEON-LABEL: f7:
 ; CHECK-NEON:       @ %bb.0:
 ; CHECK-NEON-NEXT:    vldr d17, LCPI6_0
-; CHECK-NEON-NEXT:    vmov d19, r0, r1
-; CHECK-NEON-NEXT:    vmov d18, r2, r3
-; CHECK-NEON-NEXT:    vcmp.f64 d19, d17
+; CHECK-NEON-NEXT:    vmov d18, r0, r1
+; CHECK-NEON-NEXT:    vmov d19, r2, r3
+; CHECK-NEON-NEXT:    vcmp.f64 d18, d17
 ; CHECK-NEON-NEXT:    vmov.f64 d16, #-1.000000e+00
 ; CHECK-NEON-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEON-NEXT:    it mi
-; CHECK-NEON-NEXT:    vmovmi.f64 d18, d16
-; CHECK-NEON-NEXT:    vmov r0, r1, d18
+; CHECK-NEON-NEXT:    vmovmi.f64 d19, d16
+; CHECK-NEON-NEXT:    vmov r0, r1, d19
 ; CHECK-NEON-NEXT:    bx lr
 ; CHECK-NEON-NEXT:    .p2align 3
 ; CHECK-NEON-NEXT:  @ %bb.1:
diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
index 8900d5f541e8a..b85cb3a4f191c 100644
--- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
@@ -628,13 +628,13 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM5-NEXT:    mla r0, r1, r12, r4
 ; ARM5-NEXT:    bic r0, r0, #-2147483648
 ; ARM5-NEXT:    lsrs r0, r0, #1
-; ARM5-NEXT:    rrx r1, r3
+; ARM5-NEXT:    rrx r2, r3
 ; ARM5-NEXT:    orr r0, r0, r3, lsl #30
 ; ARM5-NEXT:    ldr r3, .LCPI5_2
-; ARM5-NEXT:    bic r2, r0, #-2147483648
+; ARM5-NEXT:    bic r1, r0, #-2147483648
 ; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    subs r1, r1, r3
-; ARM5-NEXT:    sbcs r1, r2, #1
+; ARM5-NEXT:    subs r2, r2, r3
+; ARM5-NEXT:    sbcs r1, r1, #1
 ; ARM5-NEXT:    movlo r0, #1
 ; ARM5-NEXT:    pop {r4, pc}
 ; ARM5-NEXT:    .p2align 2
@@ -656,13 +656,13 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM6-NEXT:    mla r0, r1, r12, r0
 ; ARM6-NEXT:    bic r0, r0, #-2147483648
 ; ARM6-NEXT:    lsrs r0, r0, #1
-; ARM6-NEXT:    rrx r1, r3
+; ARM6-NEXT:    rrx r2, r3
 ; ARM6-NEXT:    orr r0, r0, r3, lsl #30
 ; ARM6-NEXT:    ldr r3, .LCPI5_2
-; ARM6-NEXT:    bic r2, r0, #-2147483648
+; ARM6-NEXT:    bic r1, r0, #-2147483648
 ; ARM6-NEXT:    mov r0, #0
-; ARM6-NEXT:    subs r1, r1, r3
-; ARM6-NEXT:    sbcs r1, r2, #1
+; ARM6-NEXT:    subs r2, r2, r3
+; ARM6-NEXT:    sbcs r1, r1, #1
 ; ARM6-NEXT:    movlo r0, #1
 ; ARM6-NEXT:    pop {r11, pc}
 ; ARM6-NEXT:    .p2align 2
@@ -686,14 +686,14 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM7-NEXT:    mla r0, r1, r12, r0
 ; ARM7-NEXT:    bic r0, r0, #-2147483648
 ; ARM7-NEXT:    lsrs r0, r0, #1
-; ARM7-NEXT:    rrx r1, r3
+; ARM7-NEXT:    rrx r2, r3
 ; ARM7-NEXT:    orr r0, r0, r3, lsl #30
 ; ARM7-NEXT:    movw r3, #24026
-; ARM7-NEXT:    bic r2, r0, #-2147483648
+; ARM7-NEXT:    bic r1, r0, #-2147483648
 ; ARM7-NEXT:    movt r3, #48461
-; ARM7-NEXT:    subs r1, r1, r3
+; ARM7-NEXT:    subs r2, r2, r3
 ; ARM7-NEXT:    mov r0, #0
-; ARM7-NEXT:    sbcs r1, r2, #1
+; ARM7-NEXT:    sbcs r1, r1, #1
 ; ARM7-NEXT:    movwlo r0, #1
 ; ARM7-NEXT:    pop {r11, pc}
 ;
@@ -709,14 +709,14 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM8-NEXT:    mla r0, r1, r12, r0
 ; ARM8-NEXT:    bic r0, r0, #-2147483648
 ; ARM8-NEXT:    lsrs r0, r0, #1
-; ARM8-NEXT:    rrx r1, r3
+; ARM8-NEXT:    rrx r2, r3
 ; ARM8-NEXT:    orr r0, r0, r3, lsl #30
 ; ARM8-NEXT:    movw r3, #24026
-; ARM8-NEXT:    bic r2, r0, #-2147483648
+; ARM8-NEXT:    bic r1, r0, #-2147483648
 ; ARM8-NEXT:    movt r3, #48461
-; ARM8-NEXT:    subs r1, r1, r3
+; ARM8-NEXT:    subs r2, r2, r3
 ; ARM8-NEXT:    mov r0, #0
-; ARM8-NEXT:    sbcs r1, r2, #1
+; ARM8-NEXT:    sbcs r1, r1, #1
 ; ARM8-NEXT:    movwlo r0, #1
 ; ARM8-NEXT:    pop {r11, pc}
 ;
@@ -732,14 +732,14 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; NEON7-NEXT:    mla r0, r1, r12, r0
 ; NEON7-NEXT:    bic r0, r0, #-2147483648
 ; NEON7-NEXT:    lsrs r0, r0, #1
-; NEON7-NEXT:    rrx r1, r3
+; NEON7-NEXT:    rrx r2, r3
 ; NEON7-NEXT:    orr r0, r0, r3, lsl #30
 ; NEON7-NEXT:    movw r3, #24026
-; NEON7-NEXT:    bic r2, r0, #-2147483648
+; NEON7-NEXT:    bic r1, r0, #-2147483648
 ; NEON7-NEXT:    movt r3, #48461
-; NEON7-NEXT:    subs r1, r1, r3
+; NEON7-NEXT:    subs r2, r2, r3
 ; NEON7-NEXT:    mov r0, #0
-; NEON7-NEXT:    sbcs r1, r2, #1
+; NEON7-NEXT:    sbcs r1, r1, #1
 ; NEON7-NEXT:    movwlo r0, #1
 ; NEON7-NEXT:    pop {r11, pc}
 ;
@@ -755,14 +755,14 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; NEON8-NEXT:    mla r0, r1, r12, r0
 ; NEON8-NEXT:    bic r0, r0, #-2147483648
 ; NEON8-NEXT:    lsrs r0, r0, #1
-; NEON8-NEXT:    rrx r1, r3
+; NEON8-NEXT:    rrx r2, r3
 ; NEON8-NEXT:    orr r0, r0, r3, lsl #30
 ; NEON8-NEXT:    movw r3, #24026
-; NEON8-NEXT:    bic r2, r0, #-2147483648
+; NEON8-NEXT:    bic r1, r0, #-2147483648
 ; NEON8-NEXT:    movt r3, #48461
-; NEON8-NEXT:    subs r1, r1, r3
+; NEON8-NEXT:    subs r2, r2, r3
 ; NEON8-NEXT:    mov r0, #0
-; NEON8-NEXT:    sbcs r1, r2, #1
+; NEON8-NEXT:    sbcs r1, r1, #1
 ; NEON8-NEXT:    movwlo r0, #1
 ; NEON8-NEXT:    pop {r11, pc}
   %urem = urem i63 %X, 1234567890
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
index fd894e0104c4e..754d5a25ca905 100644
--- a/llvm/test/CodeGen/DirectX/flatten-array.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -186,3 +186,6 @@ define void @global_gep_store() {
   store i32 1, i32* %3, align 4
   ret void
 }
+
+; Make sure we don't try to walk the body of a function declaration.
+declare void @opaque_function()
diff --git a/llvm/test/CodeGen/Hexagon/rdf-dce-double-cover.mir b/llvm/test/CodeGen/Hexagon/rdf-dce-double-cover.mir
new file mode 100644
index 0000000000000..3f3362b194eb6
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/rdf-dce-double-cover.mir
@@ -0,0 +1,53 @@
+# RUN: llc -march=hexagon -run-pass hexagon-rdf-opt -verify-machineinstrs %s -o - | FileCheck %s
+
+# Check that the L2_loadrd_io load from the stack into the $d6 register
+# in bb.0 is not treated as dead code by RDF, because $d6 is used by the
+# A2_minp instruction in bb.1.
+
+#CHECK-LABEL: bb.0
+#CHECK: renamable $d6 = L2_loadrd_io %stack.{{[0-9]+}}, 0
+
+--- |
+
+ define dso_local i32 @fred(ptr %a) local_unnamed_addr {
+   ret i32 0
+ }
+
+...
+---
+name: fred
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $d3:0x0000000000000003, $r5, $r8
+
+    renamable $p0 = C2_cmpgtui renamable $r8, 1
+    renamable $r8 = A2_addi killed renamable $r8, -1
+    renamable $d6 = L2_loadrd_io %stack.0, 0  :: (load (s64) from %stack.0)
+    renamable $r12, renamable $r5 = L2_loadri_pi killed renamable $r5, 4 :: (load (s32) from %ir.a)
+    J2_loop0r %bb.1, killed renamable $r8, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jumpf killed renamable $p0, %bb.2, implicit-def $pc
+    J2_jump %bb.1, implicit-def $pc
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    liveins: $d3:0x0000000000000003, $d6:0x0000000000000003, $r5
+
+    renamable $d3 = A2_minp killed renamable $d3, renamable $d6
+    renamable $r12, renamable $r5 = L2_loadri_pi killed renamable $r5, 4 :: (load (s32) from %ir.a + 4)
+    ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+  bb.2:
+    liveins: $d3:0x0000000000000003, $d6:0x0000000000000003
+
+    renamable $r0 = A2_tfr renamable $r6
+    J2_jumpr $r31, implicit-def $pc, implicit $r0
+...
+
diff --git a/llvm/test/CodeGen/LoongArch/lsx/pr116008.ll b/llvm/test/CodeGen/LoongArch/lsx/pr116008.ll
new file mode 100644
index 0000000000000..ba8ffc3493189
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/pr116008.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define <4 x i32> @xor_shl_splat_vec_one(i32 %x, <4 x i32> %y) nounwind {
+; CHECK-LABEL: xor_shl_splat_vec_one:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vreplgr2vr.w $vr1, $a0
+; CHECK-NEXT:    vsll.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vbitrevi.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+entry:
+  %ins = insertelement <4 x i32> poison, i32 %x, i64 0
+  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
+  %shl = shl <4 x i32> %splat, %y
+  %xor = xor <4 x i32> %shl, splat (i32 1)
+  ret <4 x i32> %xor
+}
diff --git a/llvm/test/CodeGen/Mips/fp16-promote.ll b/llvm/test/CodeGen/Mips/fp16-promote.ll
index c104ffb3c72eb..47bace9f5c03f 100644
--- a/llvm/test/CodeGen/Mips/fp16-promote.ll
+++ b/llvm/test/CodeGen/Mips/fp16-promote.ll
@@ -1,28 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-LIBCALL
+; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=MIPS32
+; RUN: llc -mtriple=mips64el-linux-gnu < %s | FileCheck %s -check-prefix=MIPS64
 
 define void @test_fadd(ptr %p, ptr %q) nounwind {
-; CHECK-LIBCALL-LABEL: test_fadd:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -32
-; CHECK-LIBCALL-NEXT:    sdc1 $f20, 24($sp) # 8-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $16, $4
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($5)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    mov.s $f20, $f0
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    add.s $f12, $f0, $f20
-; CHECK-LIBCALL-NEXT:    sh $2, 0($16)
-; CHECK-LIBCALL-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    ldc1 $f20, 24($sp) # 8-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 32
+; MIPS32-LABEL: test_fadd:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    sdc1 $f20, 24($sp) # 8-byte Folded Spill
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $16, $4
+; MIPS32-NEXT:    lhu $4, 0($5)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lhu $4, 0($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    mov.s $f20, $f0
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    add.s $f12, $f0, $f20
+; MIPS32-NEXT:    sh $2, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    ldc1 $f20, 24($sp) # 8-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-LABEL: test_fadd:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-NEXT:    sdc1 $f24, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $ra, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $16, $4
+; MIPS64-NEXT:    lhu $4, 0($5)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    nop
+; MIPS64-NEXT:    lhu $4, 0($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    mov.s $f24, $f0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    add.s $f12, $f0, $f24
+; MIPS64-NEXT:    sh $2, 0($16)
+; MIPS64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ldc1 $f24, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 32
   %a = load half, ptr %p, align 2
   %b = load half, ptr %q, align 2
   %r = fadd half %a, %b
@@ -31,282 +54,515 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 }
 
 define float @test_fpext_float(ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_fpext_float:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -24
-; CHECK-LIBCALL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($4)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 24
+; MIPS32-LABEL: test_fpext_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    lhu $4, 0($4)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-LABEL: test_fpext_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    lhu $4, 0($4)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    nop
+; MIPS64-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 16
   %a = load half, ptr %p, align 2
   %r = fpext half %a to float
   ret float %r
 }
 
 define double @test_fpext_double(ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_fpext_double:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -24
-; CHECK-LIBCALL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($4)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    cvt.d.s $f0, $f0
-; CHECK-LIBCALL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 24
+; MIPS32-LABEL: test_fpext_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    lhu $4, 0($4)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    cvt.d.s $f0, $f0
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-LABEL: test_fpext_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    lhu $4, 0($4)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    nop
+; MIPS64-NEXT:    cvt.d.s $f0, $f0
+; MIPS64-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 16
   %a = load half, ptr %p, align 2
   %r = fpext half %a to double
   ret double %r
 }
 
 define void @test_fptrunc_float(float %f, ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_fptrunc_float:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -24
-; CHECK-LIBCALL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    move $16, $5
-; CHECK-LIBCALL-NEXT:    sh $2, 0($16)
-; CHECK-LIBCALL-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 24
+; MIPS32-LABEL: test_fptrunc_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    move $16, $5
+; MIPS32-NEXT:    sh $2, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-LABEL: test_fptrunc_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    move $16, $5
+; MIPS64-NEXT:    sh $2, 0($16)
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 16
   %a = fptrunc float %f to half
   store half %a, ptr %p
   ret void
 }
 
 define void @test_fptrunc_double(double %d, ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_fptrunc_double:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -24
-; CHECK-LIBCALL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    jal __truncdfhf2
-; CHECK-LIBCALL-NEXT:    move $16, $6
-; CHECK-LIBCALL-NEXT:    sh $2, 0($16)
-; CHECK-LIBCALL-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 24
+; MIPS32-LABEL: test_fptrunc_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    jal __truncdfhf2
+; MIPS32-NEXT:    move $16, $6
+; MIPS32-NEXT:    sh $2, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-LABEL: test_fptrunc_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    jal __truncdfhf2
+; MIPS64-NEXT:    move $16, $5
+; MIPS64-NEXT:    sh $2, 0($16)
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 16
   %a = fptrunc double %d to half
   store half %a, ptr %p
   ret void
 }
 
 define <4 x float> @test_vec_fpext_float(ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_vec_fpext_float:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -32
-; CHECK-LIBCALL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $17, $4
-; CHECK-LIBCALL-NEXT:    lhu $4, 6($5)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $16, $5
-; CHECK-LIBCALL-NEXT:    lhu $4, 4($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    swc1 $f0, 12($17)
-; CHECK-LIBCALL-NEXT:    swc1 $f0, 8($17)
-; CHECK-LIBCALL-NEXT:    lhu $4, 2($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    swc1 $f0, 4($17)
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    swc1 $f0, 0($17)
-; CHECK-LIBCALL-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 32
+; MIPS32-LABEL: test_vec_fpext_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $17, $4
+; MIPS32-NEXT:    lhu $4, 6($5)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $16, $5
+; MIPS32-NEXT:    lhu $4, 4($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    swc1 $f0, 12($17)
+; MIPS32-NEXT:    swc1 $f0, 8($17)
+; MIPS32-NEXT:    lhu $4, 2($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    swc1 $f0, 4($17)
+; MIPS32-NEXT:    lhu $4, 0($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    swc1 $f0, 0($17)
+; MIPS32-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-LABEL: test_vec_fpext_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $18, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $17, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $16, $4
+; MIPS64-NEXT:    lhu $4, 2($4)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    nop
+; MIPS64-NEXT:    lhu $4, 6($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    mfc1 $17, $f0
+; MIPS64-NEXT:    mfc1 $18, $f0
+; MIPS64-NEXT:    lhu $4, 0($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    dsll $17, $17, 32
+; MIPS64-NEXT:    mfc1 $1, $f0
+; MIPS64-NEXT:    dsll $1, $1, 32
+; MIPS64-NEXT:    dsrl $1, $1, 32
+; MIPS64-NEXT:    or $17, $1, $17
+; MIPS64-NEXT:    lhu $4, 4($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    dsll $18, $18, 32
+; MIPS64-NEXT:    mfc1 $1, $f0
+; MIPS64-NEXT:    dsll $1, $1, 32
+; MIPS64-NEXT:    dsrl $1, $1, 32
+; MIPS64-NEXT:    or $3, $1, $18
+; MIPS64-NEXT:    move $2, $17
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $17, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $18, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 32
   %a = load <4 x half>, ptr %p, align 8
   %b = fpext <4 x half> %a to <4 x float>
   ret <4 x float> %b
 }
 
 define <4 x double> @test_vec_fpext_double(ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_vec_fpext_double:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -40
-; CHECK-LIBCALL-NEXT:    sdc1 $f20, 32($sp) # 8-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $17, $4
-; CHECK-LIBCALL-NEXT:    lhu $4, 6($5)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $16, $5
-; CHECK-LIBCALL-NEXT:    lhu $4, 4($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    mov.s $f20, $f0
-; CHECK-LIBCALL-NEXT:    lhu $4, 2($16)
-; CHECK-LIBCALL-NEXT:    cvt.d.s $f0, $f0
-; CHECK-LIBCALL-NEXT:    cvt.d.s $f2, $f20
-; CHECK-LIBCALL-NEXT:    sdc1 $f2, 24($17)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    sdc1 $f0, 16($17)
-; CHECK-LIBCALL-NEXT:    cvt.d.s $f0, $f0
-; CHECK-LIBCALL-NEXT:    sdc1 $f0, 8($17)
-; CHECK-LIBCALL-NEXT:    lhu $4, 0($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    nop
-; CHECK-LIBCALL-NEXT:    cvt.d.s $f0, $f0
-; CHECK-LIBCALL-NEXT:    sdc1 $f0, 0($17)
-; CHECK-LIBCALL-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    ldc1 $f20, 32($sp) # 8-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 40
+; MIPS32-LABEL: test_vec_fpext_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -40
+; MIPS32-NEXT:    sdc1 $f20, 32($sp) # 8-byte Folded Spill
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $17, $4
+; MIPS32-NEXT:    lhu $4, 6($5)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $16, $5
+; MIPS32-NEXT:    lhu $4, 4($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    mov.s $f20, $f0
+; MIPS32-NEXT:    lhu $4, 2($16)
+; MIPS32-NEXT:    cvt.d.s $f0, $f0
+; MIPS32-NEXT:    cvt.d.s $f2, $f20
+; MIPS32-NEXT:    sdc1 $f2, 24($17)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    sdc1 $f0, 16($17)
+; MIPS32-NEXT:    cvt.d.s $f0, $f0
+; MIPS32-NEXT:    sdc1 $f0, 8($17)
+; MIPS32-NEXT:    lhu $4, 0($16)
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    nop
+; MIPS32-NEXT:    cvt.d.s $f0, $f0
+; MIPS32-NEXT:    sdc1 $f0, 0($17)
+; MIPS32-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    ldc1 $f20, 32($sp) # 8-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 40
+;
+; MIPS64-LABEL: test_vec_fpext_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-NEXT:    sdc1 $f24, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $ra, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $17, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $17, $4
+; MIPS64-NEXT:    lhu $4, 6($5)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    move $16, $5
+; MIPS64-NEXT:    lhu $4, 4($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    mov.s $f24, $f0
+; MIPS64-NEXT:    lhu $4, 2($16)
+; MIPS64-NEXT:    cvt.d.s $f0, $f0
+; MIPS64-NEXT:    cvt.d.s $f1, $f24
+; MIPS64-NEXT:    sdc1 $f1, 24($17)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    sdc1 $f0, 16($17)
+; MIPS64-NEXT:    cvt.d.s $f0, $f0
+; MIPS64-NEXT:    sdc1 $f0, 8($17)
+; MIPS64-NEXT:    lhu $4, 0($16)
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    nop
+; MIPS64-NEXT:    cvt.d.s $f0, $f0
+; MIPS64-NEXT:    sdc1 $f0, 0($17)
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $17, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ldc1 $f24, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 32
   %a = load <4 x half>, ptr %p, align 8
   %b = fpext <4 x half> %a to <4 x double>
   ret <4 x double> %b
 }
 
 define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_vec_fptrunc_float:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -40
-; CHECK-LIBCALL-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $16, $7
-; CHECK-LIBCALL-NEXT:    move $17, $5
-; CHECK-LIBCALL-NEXT:    move $18, $4
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    mtc1 $6, $f12
-; CHECK-LIBCALL-NEXT:    move $19, $2
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    mtc1 $16, $f12
-; CHECK-LIBCALL-NEXT:    mtc1 $17, $f12
-; CHECK-LIBCALL-NEXT:    lw $16, 56($sp)
-; CHECK-LIBCALL-NEXT:    sh $2, 6($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    sh $19, 4($16)
-; CHECK-LIBCALL-NEXT:    sh $2, 2($16)
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    mtc1 $18, $f12
-; CHECK-LIBCALL-NEXT:    sh $2, 0($16)
-; CHECK-LIBCALL-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 40
+; MIPS32-LABEL: test_vec_fptrunc_float:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -40
+; MIPS32-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $19, 32($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $18, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $16, $7
+; MIPS32-NEXT:    move $17, $5
+; MIPS32-NEXT:    move $18, $4
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    mtc1 $6, $f12
+; MIPS32-NEXT:    move $19, $2
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    mtc1 $16, $f12
+; MIPS32-NEXT:    mtc1 $17, $f12
+; MIPS32-NEXT:    lw $16, 56($sp)
+; MIPS32-NEXT:    sh $2, 6($16)
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    sh $19, 4($16)
+; MIPS32-NEXT:    sh $2, 2($16)
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    mtc1 $18, $f12
+; MIPS32-NEXT:    sh $2, 0($16)
+; MIPS32-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $18, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $19, 32($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 40
+;
+; MIPS64-LABEL: test_vec_fptrunc_float:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -48
+; MIPS64-NEXT:    sd $ra, 40($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $19, 32($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $18, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $17, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $16, $6
+; MIPS64-NEXT:    move $17, $5
+; MIPS64-NEXT:    move $18, $4
+; MIPS64-NEXT:    sll $1, $18, 0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    mtc1 $1, $f12
+; MIPS64-NEXT:    move $19, $2
+; MIPS64-NEXT:    sll $1, $17, 0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    mtc1 $1, $f12
+; MIPS64-NEXT:    dsrl $1, $17, 32
+; MIPS64-NEXT:    sll $1, $1, 0
+; MIPS64-NEXT:    mtc1 $1, $f12
+; MIPS64-NEXT:    sh $2, 4($16)
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    sh $19, 0($16)
+; MIPS64-NEXT:    sh $2, 6($16)
+; MIPS64-NEXT:    dsrl $1, $18, 32
+; MIPS64-NEXT:    sll $1, $1, 0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    mtc1 $1, $f12
+; MIPS64-NEXT:    sh $2, 2($16)
+; MIPS64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $17, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $18, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $19, 32($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 40($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 48
   %b = fptrunc <4 x float> %a to <4 x half>
   store <4 x half> %b, ptr %p, align 8
   ret void
 }
 
 define void @test_vec_fptrunc_double(<4 x double> %a, ptr %p) nounwind {
-; CHECK-LIBCALL-LABEL: test_vec_fptrunc_double:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -72
-; CHECK-LIBCALL-NEXT:    sw $ra, 68($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $19, 64($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $18, 60($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $17, 56($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 52($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $16, $5
-; CHECK-LIBCALL-NEXT:    move $17, $4
-; CHECK-LIBCALL-NEXT:    lw $1, 92($sp)
-; CHECK-LIBCALL-NEXT:    sw $1, 36($sp)
-; CHECK-LIBCALL-NEXT:    lw $1, 88($sp)
-; CHECK-LIBCALL-NEXT:    sw $1, 32($sp)
-; CHECK-LIBCALL-NEXT:    lw $1, 96($sp)
-; CHECK-LIBCALL-NEXT:    lw $2, 100($sp)
-; CHECK-LIBCALL-NEXT:    sw $2, 44($sp)
-; CHECK-LIBCALL-NEXT:    sw $1, 40($sp)
-; CHECK-LIBCALL-NEXT:    ldc1 $f12, 32($sp)
-; CHECK-LIBCALL-NEXT:    sw $7, 28($sp)
-; CHECK-LIBCALL-NEXT:    jal __truncdfhf2
-; CHECK-LIBCALL-NEXT:    sw $6, 24($sp)
-; CHECK-LIBCALL-NEXT:    move $18, $2
-; CHECK-LIBCALL-NEXT:    jal __truncdfhf2
-; CHECK-LIBCALL-NEXT:    ldc1 $f12, 40($sp)
-; CHECK-LIBCALL-NEXT:    ldc1 $f12, 24($sp)
-; CHECK-LIBCALL-NEXT:    lw $19, 104($sp)
-; CHECK-LIBCALL-NEXT:    sh $2, 6($19)
-; CHECK-LIBCALL-NEXT:    jal __truncdfhf2
-; CHECK-LIBCALL-NEXT:    sh $18, 4($19)
-; CHECK-LIBCALL-NEXT:    sh $2, 2($19)
-; CHECK-LIBCALL-NEXT:    sw $16, 20($sp)
-; CHECK-LIBCALL-NEXT:    sw $17, 16($sp)
-; CHECK-LIBCALL-NEXT:    jal __truncdfhf2
-; CHECK-LIBCALL-NEXT:    ldc1 $f12, 16($sp)
-; CHECK-LIBCALL-NEXT:    sh $2, 0($19)
-; CHECK-LIBCALL-NEXT:    lw $16, 52($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $17, 56($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $18, 60($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $19, 64($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 68($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 72
+; MIPS32-LABEL: test_vec_fptrunc_double:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -72
+; MIPS32-NEXT:    sw $ra, 68($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $19, 64($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $18, 60($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $17, 56($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 52($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $16, $5
+; MIPS32-NEXT:    move $17, $4
+; MIPS32-NEXT:    lw $1, 92($sp)
+; MIPS32-NEXT:    sw $1, 36($sp)
+; MIPS32-NEXT:    lw $1, 88($sp)
+; MIPS32-NEXT:    sw $1, 32($sp)
+; MIPS32-NEXT:    lw $1, 96($sp)
+; MIPS32-NEXT:    lw $2, 100($sp)
+; MIPS32-NEXT:    sw $2, 44($sp)
+; MIPS32-NEXT:    sw $1, 40($sp)
+; MIPS32-NEXT:    ldc1 $f12, 32($sp)
+; MIPS32-NEXT:    sw $7, 28($sp)
+; MIPS32-NEXT:    jal __truncdfhf2
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    move $18, $2
+; MIPS32-NEXT:    jal __truncdfhf2
+; MIPS32-NEXT:    ldc1 $f12, 40($sp)
+; MIPS32-NEXT:    ldc1 $f12, 24($sp)
+; MIPS32-NEXT:    lw $19, 104($sp)
+; MIPS32-NEXT:    sh $2, 6($19)
+; MIPS32-NEXT:    jal __truncdfhf2
+; MIPS32-NEXT:    sh $18, 4($19)
+; MIPS32-NEXT:    sh $2, 2($19)
+; MIPS32-NEXT:    sw $16, 20($sp)
+; MIPS32-NEXT:    sw $17, 16($sp)
+; MIPS32-NEXT:    jal __truncdfhf2
+; MIPS32-NEXT:    ldc1 $f12, 16($sp)
+; MIPS32-NEXT:    sh $2, 0($19)
+; MIPS32-NEXT:    lw $16, 52($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $17, 56($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $18, 60($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $19, 64($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 68($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 72
+;
+; MIPS64-LABEL: test_vec_fptrunc_double:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -48
+; MIPS64-NEXT:    sd $ra, 40($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $20, 32($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $19, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $18, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $17, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $16, $8
+; MIPS64-NEXT:    move $17, $7
+; MIPS64-NEXT:    move $18, $5
+; MIPS64-NEXT:    move $19, $4
+; MIPS64-NEXT:    jal __truncdfhf2
+; MIPS64-NEXT:    dmtc1 $6, $f12
+; MIPS64-NEXT:    move $20, $2
+; MIPS64-NEXT:    jal __truncdfhf2
+; MIPS64-NEXT:    dmtc1 $17, $f12
+; MIPS64-NEXT:    dmtc1 $18, $f12
+; MIPS64-NEXT:    sh $2, 6($16)
+; MIPS64-NEXT:    jal __truncdfhf2
+; MIPS64-NEXT:    sh $20, 4($16)
+; MIPS64-NEXT:    sh $2, 2($16)
+; MIPS64-NEXT:    jal __truncdfhf2
+; MIPS64-NEXT:    dmtc1 $19, $f12
+; MIPS64-NEXT:    sh $2, 0($16)
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $17, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $18, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $19, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $20, 32($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 40($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 48
   %b = fptrunc <4 x double> %a to <4 x half>
   store <4 x half> %b, ptr %p, align 8
   ret void
 }
 
 define half @test_fadd_fadd(half %a, half %b, half %c) nounwind {
-; CHECK-LIBCALL-LABEL: test_fadd_fadd:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, -40
-; CHECK-LIBCALL-NEXT:    sdc1 $f20, 32($sp) # 8-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT:    move $16, $6
-; CHECK-LIBCALL-NEXT:    move $17, $4
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $4, $5
-; CHECK-LIBCALL-NEXT:    mov.s $f20, $f0
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $4, $17
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    add.s $f12, $f0, $f20
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $4, $2
-; CHECK-LIBCALL-NEXT:    mov.s $f20, $f0
-; CHECK-LIBCALL-NEXT:    jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    move $4, $16
-; CHECK-LIBCALL-NEXT:    jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT:    add.s $f12, $f20, $f0
-; CHECK-LIBCALL-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    ldc1 $f20, 32($sp) # 8-byte Folded Reload
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    addiu $sp, $sp, 40
+; MIPS32-LABEL: test_fadd_fadd:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    addiu $sp, $sp, -40
+; MIPS32-NEXT:    sdc1 $f20, 32($sp) # 8-byte Folded Spill
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $17, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    move $16, $6
+; MIPS32-NEXT:    move $17, $4
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $4, $5
+; MIPS32-NEXT:    mov.s $f20, $f0
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $4, $17
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    add.s $f12, $f0, $f20
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $4, $2
+; MIPS32-NEXT:    mov.s $f20, $f0
+; MIPS32-NEXT:    jal __gnu_h2f_ieee
+; MIPS32-NEXT:    move $4, $16
+; MIPS32-NEXT:    jal __gnu_f2h_ieee
+; MIPS32-NEXT:    add.s $f12, $f20, $f0
+; MIPS32-NEXT:    lw $16, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $17, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    ldc1 $f20, 32($sp) # 8-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 40
+;
+; MIPS64-LABEL: test_fadd_fadd:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-NEXT:    sdc1 $f24, 24($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $ra, 16($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $17, 8($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    sd $16, 0($sp) # 8-byte Folded Spill
+; MIPS64-NEXT:    move $16, $6
+; MIPS64-NEXT:    move $17, $4
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    sll $4, $5, 0
+; MIPS64-NEXT:    mov.s $f24, $f0
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    sll $4, $17, 0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    add.s $f12, $f0, $f24
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    sll $4, $2, 0
+; MIPS64-NEXT:    mov.s $f24, $f0
+; MIPS64-NEXT:    jal __gnu_h2f_ieee
+; MIPS64-NEXT:    sll $4, $16, 0
+; MIPS64-NEXT:    jal __gnu_f2h_ieee
+; MIPS64-NEXT:    add.s $f12, $f24, $f0
+; MIPS64-NEXT:    ld $16, 0($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $17, 8($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ld $ra, 16($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    ldc1 $f24, 24($sp) # 8-byte Folded Reload
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    daddiu $sp, $sp, 32
   %d = fadd half %a, %b
   %e = fadd half %d, %c
   ret half %e
 }
 
 define half @to_half(i16 %bits) nounwind {
-; CHECK-LIBCALL-LABEL: to_half:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    move $2, $4
+; MIPS32-LABEL: to_half:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    move $2, $4
+;
+; MIPS64-LABEL: to_half:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    sll $2, $4, 0
   %f = bitcast i16 %bits to half
   ret half %f
 }
 
 define i16 @from_half(half %f) nounwind {
-; CHECK-LIBCALL-LABEL: from_half:
-; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    jr $ra
-; CHECK-LIBCALL-NEXT:    move $2, $4
+; MIPS32-LABEL: from_half:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    move $2, $4
+;
+; MIPS64-LABEL: from_half:
+; MIPS64:       # %bb.0:
+; MIPS64-NEXT:    jr $ra
+; MIPS64-NEXT:    sll $2, $4, 0
   %bits = bitcast half %f to i16
   ret i16 %bits
 }
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
index a53c90ac6db8b..3e54aaf558072 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s
 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %}
 
@@ -6,36 +7,48 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
 declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
 
-; CHECK-LABEL: test_sin(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sin_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
+; CHECK-LABEL: test_sin(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sin_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    sin.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; CHECK-NEXT:    sin.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_cos(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_cos_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
+; CHECK-LABEL: test_cos(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_cos_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    cos.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; CHECK-NEXT:    cos.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 925ae4245a4c2..e545d4c117791 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
@@ -5,163 +6,231 @@
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
-; CHECK-LABEL: test_ret_const(
-; CHECK:     mov.b32         [[T:%r[0-9+]]], 1073758080;
-; CHECK:     st.param.b32    [func_retval0], [[T]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 1073758080;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
 }
 
 ; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; SM90-DAG:        mov.b32        [[I:%r[0-9+]]], 1073758080;
-; SM90-DAG:        add.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[I]];
-;
-; SM80-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; SM80-DAG:  cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]]
-; SM80-DAG:  cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]]
-; SM80-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; SM80-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; SM80-DAG:  cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; SM80-DAG:  cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; SM80-DAG:  mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
+; SM80-LABEL: test_fadd_imm_0(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    add.rn.f32 %f2, %f1, 0f40000000;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    add.rn.f32 %f4, %f3, 0f3F800000;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_fadd_imm_0(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
+; SM90-NEXT:    mov.b32 %r2, 1073758080;
+; SM90-NEXT:    add.rn.bf16x2 %r3, %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    ret;
   %r = fadd <2 x bfloat> <bfloat 1.0, bfloat 2.0>, %a
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fadd_imm_1_param_0];
-; SM90:       mov.b16         [[B:%rs[0-9]+]], 0x3F80;
-; SM90:       add.rn.bf16     [[R:%rs[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   cvt.f32.bf16    [[FA:%f[0-9]+]], [[A]];
-; SM80:       add.rn.f32      [[FR:%f[0-9]+]], [[FA]], 0f3F800000;
-; SM80:       cvt.rn.bf16.f32 [[R:%rs[0-9]+]], [[FR]];
-
-; CHECK:      st.param.b16    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define bfloat @test_fadd_imm_1(bfloat %a) #0 {
+; SM80-LABEL: test_fadd_imm_1(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<3>;
+; SM80-NEXT:    .reg .f32 %f<3>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; SM80-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f2;
+; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_fadd_imm_1(
+; SM90:       {
+; SM90-NEXT:    .reg .b16 %rs<4>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM90-NEXT:    mov.b16 %rs2, 0x3F80;
+; SM90-NEXT:    add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-NEXT:    st.param.b16 [func_retval0], %rs3;
+; SM90-NEXT:    ret;
   %r = fadd bfloat %a, 1.0
   ret bfloat %r
 }
 
-; CHECK-LABEL: test_fsubx2(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fsubx2_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fsubx2_param_1];
-; SM90:       sub.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
-; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
-; SM80-DAG:   sub.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; SM80-DAG:   sub.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
-; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
-
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
-
 define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; SM80-LABEL: test_fsubx2(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<7>;
+; SM80-NEXT:    .reg .b32 %r<4>;
+; SM80-NEXT:    .reg .f32 %f<7>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_0];
+; SM80-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_1];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f2, %rs4;
+; SM80-NEXT:    sub.rn.f32 %f3, %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs5, %f3;
+; SM80-NEXT:    cvt.f32.bf16 %f4, %rs1;
+; SM80-NEXT:    cvt.f32.bf16 %f5, %rs3;
+; SM80-NEXT:    sub.rn.f32 %f6, %f5, %f4;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs6, %f6;
+; SM80-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_fsubx2(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_fsubx2_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [test_fsubx2_param_0];
+; SM90-NEXT:    sub.rn.bf16x2 %r3, %r2, %r1;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    ret;
   %r = fsub <2 x bfloat> %a, %b
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fmulx2(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmulx2_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmulx2_param_1];
-; SM90:       mul.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
-; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
-; SM80-DAG:   mul.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; SM80-DAG:   mul.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
-; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
-
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
-
 define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; SM80-LABEL: test_fmulx2(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<7>;
+; SM80-NEXT:    .reg .b32 %r<4>;
+; SM80-NEXT:    .reg .f32 %f<7>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_0];
+; SM80-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_1];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f2, %rs4;
+; SM80-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs5, %f3;
+; SM80-NEXT:    cvt.f32.bf16 %f4, %rs1;
+; SM80-NEXT:    cvt.f32.bf16 %f5, %rs3;
+; SM80-NEXT:    mul.rn.f32 %f6, %f5, %f4;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs6, %f6;
+; SM80-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_fmulx2(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_fmulx2_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [test_fmulx2_param_0];
+; SM90-NEXT:    mul.rn.bf16x2 %r3, %r2, %r1;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM90-NEXT:    ret;
   %r = fmul <2 x bfloat> %a, %b
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.bf16     [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.bf16     [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[FR0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fdiv(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs4;
+; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.bf16 %f4, %rs1;
+; CHECK-NEXT:    cvt.f32.bf16 %f5, %rs3;
+; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs6, %f6;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = fdiv <2 x bfloat> %a, %b
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_fneg_param_0];
-
-; CHECK-DAG:        xor.b32        [[IHH0:%r[0-9]+]], [[A]], -2147450880;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[IHH0]];
-; CHECK-NEXT: ret;
 define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_fneg(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
+; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = fneg <2 x bfloat> %a
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: .func test_ldst_v2bf16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2bf16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2bf16_param_1];
-; CHECK-DAG:    ld.b32          [[E:%r[0-9]+]], [%[[A]]]
-; CHECK-DAG:    st.b32          [%[[B]]], [[E]];
-; CHECK:        ret;
 define void @test_ldst_v2bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v2bf16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2bf16_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2bf16_param_1];
+; CHECK-NEXT:    st.b32 [%rd2], %r1;
+; CHECK-NEXT:    ret;
   %t1 = load <2 x bfloat>, ptr %a
   store <2 x bfloat> %t1, ptr %b, align 16
   ret void
 }
 
-; CHECK-LABEL: .func test_ldst_v3bf16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3bf16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3bf16_param_1];
-; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
-;    number of bitshifting instructions that may change at llvm's whim.
-;    So we only verify that we only issue correct number of writes using
-;    correct offset, but not the values we write.
-; CHECK-DAG:    ld.u64
-; CHECK-DAG:    st.u32          [%[[B]]],
-; CHECK-DAG:    st.b16          [%[[B]]+4],
-; CHECK:        ret;
 define void @test_ldst_v3bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v3bf16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
+; CHECK-NEXT:    ld.u64 %rd2, [%rd1];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
+; CHECK-NEXT:    ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
+; CHECK-NEXT:    st.u32 [%rd3], %rd2;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
+; CHECK-NEXT:    st.b16 [%rd3+4], %rs1;
+; CHECK-NEXT:    ret;
   %t1 = load <3 x bfloat>, ptr %a
   store <3 x bfloat> %t1, ptr %b, align 16
   ret void
@@ -169,161 +238,241 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) {
 
 declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0
 
-; CHECK-LABEL: test_call(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_param_1];
-; CHECK:      {
-; CHECK-DAG:  .param .align 4 .b8 param0[4];
-; CHECK-DAG:  .param .align 4 .b8 param1[4];
-; CHECK-DAG:  st.param.b32    [param0], [[A]];
-; CHECK-DAG:  st.param.b32    [param1], [[B]];
-; CHECK-DAG:  .param .align 4 .b8 retval0[4];
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT:        test_callee,
-; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; CHECK-LABEL: test_call(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.b32 [param0], %r1;
+; CHECK-NEXT:    .param .align 4 .b8 param1[4];
+; CHECK-NEXT:    st.param.b32 [param1], %r2;
+; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    test_callee,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_select(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_param_1];
-; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
-; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_0];
+; CHECK-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_select_cc(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_param_3];
-;
-; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-;
-; SM80-DAG: mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; SM80-DAG: mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
-; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
-; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
-; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
-; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
-
 define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 {
+; SM80-LABEL: test_select_cc(
+; SM80:       {
+; SM80-NEXT:    .reg .pred %p<3>;
+; SM80-NEXT:    .reg .b16 %rs<11>;
+; SM80-NEXT:    .reg .b32 %r<6>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; SM80-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; SM80-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; SM80-NEXT:    cvt.f32.bf16 %f2, %rs3;
+; SM80-NEXT:    setp.neu.f32 %p1, %f2, %f1;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %f4, %rs4;
+; SM80-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM80-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
+; SM80-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
+; SM80-NEXT:    mov.b32 %r5, {%rs10, %rs9};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r5;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_select_cc(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<7>;
+; SM90-NEXT:    .reg .b32 %r<6>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; SM90-NEXT:    ld.param.b32 %r3, [test_select_cc_param_3];
+; SM90-NEXT:    ld.param.b32 %r4, [test_select_cc_param_2];
+; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r4, %r3;
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; SM90-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; SM90-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
+; SM90-NEXT:    mov.b32 %r5, {%rs6, %rs5};
+; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
+; SM90-NEXT:    ret;
   %cc = fcmp une <2 x bfloat> %c, %d
   %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %r
 }
 
-
-; CHECK-LABEL: test_select_cc_f32_bf16(
-; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_0];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f32_bf16_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f32_bf16_param_3];
-; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_1];
-
-; SM80-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; SM80-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
-; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
-; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
-; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
-; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG: selp.f32        [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.f32        [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK-NEXT: st.param.v2.f32    [func_retval0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
 define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
+; SM80-LABEL: test_select_cc_f32_bf16(
+; SM80:       {
+; SM80-NEXT:    .reg .pred %p<3>;
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<11>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
+; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
+; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f4, %rs3;
+; SM80-NEXT:    setp.neu.f32 %p1, %f4, %f3;
+; SM80-NEXT:    cvt.f32.bf16 %f5, %rs2;
+; SM80-NEXT:    cvt.f32.bf16 %f6, %rs4;
+; SM80-NEXT:    setp.neu.f32 %p2, %f6, %f5;
+; SM80-NEXT:    ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
+; SM80-NEXT:    selp.f32 %f9, %f2, %f8, %p2;
+; SM80-NEXT:    selp.f32 %f10, %f1, %f7, %p1;
+; SM80-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_select_cc_f32_bf16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b32 %r<3>;
+; SM90-NEXT:    .reg .f32 %f<7>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
+; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
+; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r2, %r1;
+; SM90-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
+; SM90-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
+; SM90-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
+; SM90-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
+; SM90-NEXT:    ret;
                                            <2 x bfloat> %c, <2 x bfloat> %d) #0 {
   %cc = fcmp une <2 x bfloat> %c, %d
   %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
   ret <2 x float> %r
 }
 
-; CHECK-LABEL: test_select_cc_bf16_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_bf16_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_bf16_f32_param_1];
-; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_2];
-; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_3];
-; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
+; CHECK-LABEL: test_select_cc_bf16_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<3>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
+; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
+; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
+; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
+; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
   %cc = fcmp une <2 x float> %c, %d
   %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fptrunc_2xfloat(
-; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f2;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
+; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x bfloat>
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fpext_2xfloat(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[R0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[R1:%f[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]};
-; CHECK:      ret;
 define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs1;
+; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    ret;
   %r = fpext <2 x bfloat> %a to <2 x float>
   ret <2 x float> %r
 }
 
-; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_bitcast_2xbf16_to_2xi16_param_0];
-; CHECK:      st.param.b32 [func_retval0], [[A]]
-; CHECK:      ret;
 define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = bitcast <2 x bfloat> %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-
-; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
-; CHECK:      ld.param.b32     [[R]], [test_bitcast_2xi16_to_2xbf16_param_0];
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi16_to_2xbf16_param_0];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = bitcast <2 x i16> %a to <2 x bfloat>
   ret <2 x bfloat> %r
 }
@@ -351,184 +500,374 @@ declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0
 declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0
 declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
 
-
-; CHECK-LABEL: test_sqrt(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sqrt_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_sqrt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fmuladd_param_2];
-;
-; CHECK:       fma.rn.bf16x2   [[RA:%r[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NEXT: st.param.b32    [func_retval0], [[RA]];
-; CHECK:      ret;
 define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
+; CHECK-LABEL: test_fmuladd(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_0];
+; CHECK-NEXT:    fma.rn.bf16x2 %r4, %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK:      and.b32         [[R:%r[0-9]+]], [[A]], 2147450879;
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_fabs(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
+; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_fabs_add(
-; CHECK:      abs.bf16x2
-; CHECK:      ret;
 define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; SM80-LABEL: test_fabs_add(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<11>;
+; SM80-NEXT:    .reg .b32 %r<6>;
+; SM80-NEXT:    .reg .f32 %f<11>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
+; SM80-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    add.rn.f32 %f2, %f1, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    add.rn.f32 %f4, %f3, %f3;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r3, {%rs4, %rs3};
+; SM80-NEXT:    abs.bf16x2 %r4, %r3;
+; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r4;
+; SM80-NEXT:    cvt.f32.bf16 %f5, %rs6;
+; SM80-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f6, %rs8;
+; SM80-NEXT:    add.rn.f32 %f7, %f5, %f6;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs9, %f7;
+; SM80-NEXT:    cvt.f32.bf16 %f8, %rs5;
+; SM80-NEXT:    cvt.f32.bf16 %f9, %rs7;
+; SM80-NEXT:    add.rn.f32 %f10, %f8, %f9;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs10, %f10;
+; SM80-NEXT:    mov.b32 %r5, {%rs10, %rs9};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r5;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_fabs_add(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<6>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_fabs_add_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [test_fabs_add_param_0];
+; SM90-NEXT:    add.rn.bf16x2 %r3, %r2, %r2;
+; SM90-NEXT:    abs.bf16x2 %r4, %r3;
+; SM90-NEXT:    add.rn.bf16x2 %r5, %r4, %r1;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r5;
+; SM90-NEXT:    ret;
   %s = fadd <2 x bfloat> %a, %a
   %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
   %d = fadd <2 x bfloat> %r, %b
   ret <2 x bfloat> %d
 }
 
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG:  min.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
-; CHECK:      st.param.b32    [func_retval0], [[RF0]];
-; CHECK:      ret;
 define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; CHECK-LABEL: test_minnum(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_0];
+; CHECK-NEXT:    min.bf16x2 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG:  max.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
-; CHECK:      st.param.b32    [func_retval0], [[RF0]];
-; CHECK:      ret;
 define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; CHECK-LABEL: test_maxnum(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_0];
+; CHECK-NEXT:    max.bf16x2 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %r
 }
 
-
-
-; CHECK-LABEL: test_floor(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_floor_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rmi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rmi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:  cvt.rmi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
-; SM80-DAG:  cvt.rmi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
+; SM80-LABEL: test_floor(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    cvt.rmi.f32.f32 %f4, %f3;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_floor(
+; SM90:       {
+; SM90-NEXT:    .reg .b16 %rs<5>;
+; SM90-NEXT:    .reg .b32 %r<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs3, %rs2;
+; SM90-NEXT:    cvt.rmi.bf16.bf16 %rs4, %rs1;
+; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_ceil(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_ceil_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rpi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rpi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.rpi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
-; SM80-DAG:   cvt.rpi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
+; SM80-LABEL: test_ceil(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    cvt.rpi.f32.f32 %f4, %f3;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_ceil(
+; SM90:       {
+; SM90-NEXT:    .reg .b16 %rs<5>;
+; SM90-NEXT:    .reg .b32 %r<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs3, %rs2;
+; SM90-NEXT:    cvt.rpi.bf16.bf16 %rs4, %rs1;
+; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_trunc(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_trunc_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
+; SM80-LABEL: test_trunc(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_trunc(
+; SM90:       {
+; SM90-NEXT:    .reg .b16 %rs<5>;
+; SM90-NEXT:    .reg .b32 %r<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs3, %rs2;
+; SM90-NEXT:    cvt.rzi.bf16.bf16 %rs4, %rs1;
+; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_rint(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_rint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
+; SM80-LABEL: test_rint(
+; SM80:       {
+; SM80-NEXT:    .reg .b16 %rs<5>;
+; SM80-NEXT:    .reg .b32 %r<3>;
+; SM80-NEXT:    .reg .f32 %f<5>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM80-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT:    cvt.rni.f32.f32 %f2, %f1;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs3, %f2;
+; SM80-NEXT:    cvt.f32.bf16 %f3, %rs1;
+; SM80-NEXT:    cvt.rni.f32.f32 %f4, %f3;
+; SM80-NEXT:    cvt.rn.bf16.f32 %rs4, %f4;
+; SM80-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_rint(
+; SM90:       {
+; SM90-NEXT:    .reg .b16 %rs<5>;
+; SM90-NEXT:    .reg .b32 %r<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
+; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; SM90-NEXT:    cvt.rni.bf16.bf16 %rs3, %rs2;
+; SM90-NEXT:    cvt.rni.bf16.bf16 %rs4, %rs1;
+; SM90-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_round(
-; CHECK:      ld.param.b32    {{.*}}, [test_round_param_0];
-; check the use of sign mask and 0.5 to implement round
-; CHECK:      and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R1]], 1056964608;
-; CHECK:      and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R2]], 1056964608;
-; CHECK:      st.param.b32    [func_retval0], {{.*}};
-; CHECK:      ret;
 define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
+; CHECK-LABEL: test_round(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 %r2, %f1;
+; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
+; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
+; CHECK-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
+; CHECK-NEXT:    abs.f32 %f5, %f1;
+; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
+; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs3, %f8;
+; CHECK-NEXT:    cvt.f32.bf16 %f9, %rs1;
+; CHECK-NEXT:    mov.b32 %r5, %f9;
+; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
+; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
+; CHECK-NEXT:    mov.b32 %f10, %r7;
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
+; CHECK-NEXT:    abs.f32 %f13, %f9;
+; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
+; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
+; CHECK-NEXT:    cvt.rn.bf16.f32 %rs4, %f16;
+; CHECK-NEXT:    mov.b32 %r8, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_param_1];
-; SM80-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; SM80-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; SM80-DAG:  abs.bf16        [[AW1:%rs[0-9]+]], [[A1]];
-; SM80-DAG:  neg.bf16        [[AY1:%rs[0-9]+]], [[AW1]];
-; SM80-DAG:  shr.u16         [[BS1:%rs[0-9]+]], [[B1]], 15;
-; SM80-DAG:  and.b16         [[BR1:%rs[0-9]+]], [[BS1]], 1;
-; SM80-DAG:  setp.eq.b16     [[P1:%p[0-9]+]], [[BR1]], 1;
-; SM80-DAG:  selp.b16        [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]]
-; SM80-DAG:  abs.bf16        [[AW0:%rs[0-9]+]], [[A0]];
-; SM80-DAG:  neg.bf16        [[AY0:%rs[0-9]+]], [[AW0]];
-; SM80-DAG:  shr.u16         [[BS0:%rs[0-9]+]], [[B0]], 15;
-; SM80-DAG:  and.b16         [[BR0:%rs[0-9]+]], [[BS0]], 1;
-; SM80-DAG:  setp.eq.b16     [[P0:%p[0-9]+]], [[BR0]], 1;
-; SM80-DAG:  selp.b16        [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]]
-; SM80-DAG:  mov.b32         [[R:%r[0-9]+]], {[[RS0]], [[RS1]]}
-; SM90-DAG:  and.b32         [[R1:%r[0-9]+]], [[B]], -2147450880;
-; SM90-DAG:  and.b32         [[R2:%r[0-9]+]], [[A]], 2147450879;
-; SM90-DAG:  or.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+; SM80-LABEL: test_copysign(
+; SM80:       {
+; SM80-NEXT:    .reg .pred %p<3>;
+; SM80-NEXT:    .reg .b16 %rs<17>;
+; SM80-NEXT:    .reg .b32 %r<4>;
+; SM80-EMPTY:
+; SM80-NEXT:  // %bb.0:
+; SM80-NEXT:    ld.param.b32 %r1, [test_copysign_param_1];
+; SM80-NEXT:    ld.param.b32 %r2, [test_copysign_param_0];
+; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; SM80-NEXT:    abs.bf16 %rs3, %rs2;
+; SM80-NEXT:    neg.bf16 %rs4, %rs3;
+; SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; SM80-NEXT:    shr.u16 %rs8, %rs6, 15;
+; SM80-NEXT:    and.b16 %rs9, %rs8, 1;
+; SM80-NEXT:    setp.eq.b16 %p1, %rs9, 1;
+; SM80-NEXT:    selp.b16 %rs10, %rs4, %rs3, %p1;
+; SM80-NEXT:    abs.bf16 %rs11, %rs1;
+; SM80-NEXT:    neg.bf16 %rs12, %rs11;
+; SM80-NEXT:    shr.u16 %rs14, %rs5, 15;
+; SM80-NEXT:    and.b16 %rs15, %rs14, 1;
+; SM80-NEXT:    setp.eq.b16 %p2, %rs15, 1;
+; SM80-NEXT:    selp.b16 %rs16, %rs12, %rs11, %p2;
+; SM80-NEXT:    mov.b32 %r3, {%rs16, %rs10};
+; SM80-NEXT:    st.param.b32 [func_retval0], %r3;
+; SM80-NEXT:    ret;
+;
+; SM90-LABEL: test_copysign(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<9>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
+; SM90-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
+; SM90-NEXT:    and.b32 %r4, %r2, -2147450880;
+; SM90-NEXT:    and.b32 %r6, %r1, 2147450879;
+; SM90-NEXT:    or.b32 %r7, %r6, %r4;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r7;
+; SM90-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %r
 }
diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll
new file mode 100644
index 0000000000000..d12653e813bd1
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; ## FP16 abs is not supported by PTX version (PTX < 65).
+; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx60                                      \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                            \
+; RUN:   llc < %s -mcpu=sm_53 -mattr=+ptx60                                    \
+; RUN:            -O0 -disable-post-ra -verify-machineinstrs                   \
+; RUN:   | %ptxas-verify -arch=sm_53                                           \
+; RUN: %}
+
+; ## FP16 support explicitly disabled (--nvptx-no-f16-math).
+; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math                  \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                            \
+; RUN:   llc < %s -mcpu=sm_53 -mattr=+ptx65 --nvptx-no-f16-math                \
+; RUN:            -O0 -disable-post-ra -verify-machineinstrs                   \
+; RUN:   | %ptxas-verify -arch=sm_53                                           \
+; RUN: %}
+
+; ## FP16 is not supported by hardware (SM < 53).
+; RUN: llc < %s -mcpu=sm_52 -mattr=+ptx65                                      \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN: | FileCheck -check-prefix CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                            \
+; RUN:   llc < %s -mcpu=sm_52 -mattr=+ptx65                                    \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN:   | %ptxas-verify -arch=sm_52                                           \
+; RUN: %}
+
+; ## Full FP16 abs support.
+; RUN: llc < %s -mcpu=sm_53 -mattr=+ptx65                                      \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN: | FileCheck -check-prefix CHECK-F16-ABS %s
+; RUN: %if ptxas %{                                                            \
+; RUN:   llc < %s -mcpu=sm_53 -mattr=+ptx65                                    \
+; RUN:          -O0 -disable-post-ra -verify-machineinstrs                     \
+; RUN:   | %ptxas-verify -arch=sm_53                                           \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare half @llvm.fabs.f16(half %a)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+
+define half @test_fabs(half %a) {
+; CHECK-NOF16-LABEL: test_fabs(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs(
+; CHECK-F16-ABS:       {
+; CHECK-F16-ABS-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT:  // %bb.0:
+; CHECK-F16-ABS-NEXT:    ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-F16-ABS-NEXT:    abs.f16 %rs2, %rs1;
+; CHECK-F16-ABS-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-F16-ABS-NEXT:    ret;
+  %r = call half @llvm.fabs.f16(half %a)
+  ret half %r
+}
+
+define <2 x half> @test_fabs_2(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fabs_2(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-NEXT:    and.b32 %r3, %r1, 2147450879;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs_2(
+; CHECK-F16-ABS:       {
+; CHECK-F16-ABS-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT:  // %bb.0:
+; CHECK-F16-ABS-NEXT:    ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-ABS-NEXT:    abs.f16x2 %r2, %r1;
+; CHECK-F16-ABS-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-F16-ABS-NEXT:    ret;
+  %r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index b11c69e064c4a..eb0b00e883846 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1,325 +1,459 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-F16 %s
 ; RUN: %if ptxas %{                                                           \
-; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN:   | %ptxas-verify -arch=sm_53                                          \
 ; RUN: %}
 ; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
 ; RUN:           -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
 ; RUN: %if ptxas %{                                                           \
-; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math   \
 ; RUN:           -verify-machineinstrs                                        \
 ; RUN:   | %ptxas-verify -arch=sm_53                                          \
 ; RUN: %}
 ; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
 ; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOF16 %s
 ; RUN: %if ptxas %{                                                               \
-; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
 ; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs         \
 ; RUN:   | %ptxas-verify -arch=sm_52                                              \
 ; RUN: %}
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
-; CHECK-LABEL: test_ret_const(
-; CHECK:     mov.b32         [[R:%r[0-9+]]], 1073757184;
-; CHECK:     st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 1073757184;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   ret <2 x half> <half 1.0, half 2.0>
 }
 
-; CHECK-LABEL: test_extract_0(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_extract_0_param_0];
-; CHECK:      mov.b32         {[[R:%rs[0-9]+]], tmp}, [[A]];
-; CHECK:      st.param.b16    [func_retval0], [[R]];
-; CHECK:      ret;
 define half @test_extract_0(<2 x half> %a) #0 {
+; CHECK-LABEL: test_extract_0(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
   %e = extractelement <2 x half> %a, i32 0
   ret half %e
 }
 
-; CHECK-LABEL: test_extract_1(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_extract_1_param_0];
-; CHECK:      mov.b32         {tmp, [[R:%rs[0-9]+]]}, [[A]];
-; CHECK:      st.param.b16    [func_retval0], [[R]];
-; CHECK:      ret;
 define half @test_extract_1(<2 x half> %a) #0 {
+; CHECK-LABEL: test_extract_1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT:    ret;
   %e = extractelement <2 x half> %a, i32 1
   ret half %e
 }
 
-; CHECK-LABEL: test_extract_i(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_extract_i_param_0];
-; CHECK-DAG:  ld.param.u64    [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
-; CHECK-DAG:  setp.eq.s64     [[PRED:%p[0-9]+]], [[IDX]], 0;
-; CHECK-DAG:  mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]];
-; CHECK:      selp.b16        [[R:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]];
-; CHECK:      st.param.b16    [func_retval0], [[R]];
-; CHECK:      ret;
 define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+; CHECK-LABEL: test_extract_i(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
+; CHECK-NEXT:    setp.eq.s64 %p1, %rd1, 0;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
   %e = extractelement <2 x half> %a, i64 %idx
   ret half %e
 }
 
-; CHECK-LABEL: test_fadd(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fadd_param_1];
-;
-; CHECK-F16-NEXT:   add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fadd(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fadd_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fadd_param_0];
+; CHECK-F16-NEXT:    add.rn.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fadd(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fadd_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> %a, %b
   ret <2 x half> %r
 }
 
 ; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; CHECK-F16:        mov.b32        [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:        add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[I]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fadd_imm_0(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
+; CHECK-F16-NEXT:    mov.b32 %r2, 1073757184;
+; CHECK-F16-NEXT:    add.rn.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fadd_imm_0(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f2, %f1, 0f40000000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f4, %f3, 0f3F800000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> <half 1.0, half 2.0>, %a
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fadd_imm_1_param_0];
-;
-; CHECK-F16:        mov.b32        [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:        add.rn.f16x2   [[R:%r[0-9]+]], [[B]], [[I]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fadd_imm_1(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_1_param_0];
+; CHECK-F16-NEXT:    mov.b32 %r2, 1073757184;
+; CHECK-F16-NEXT:    add.rn.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fadd_imm_1(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_1_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f2, %f1, 0f40000000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f4, %f3, 0f3F800000;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    ret;
   %r = fadd <2 x half> %a, <half 1.0, half 2.0>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fsub(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fsub_param_0];
-;
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fsub(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fsub_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fsub_param_0];
+; CHECK-F16-NEXT:    sub.rn.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fsub(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fsub_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fsub_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    ret;
   %r = fsub <2 x half> %a, %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fneg_param_0];
-;
-; CHECK-F16:        mov.b32        [[I:%r[0-9+]]], 0;
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[I]], [[A]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  mov.f32        [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
-; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fneg(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fneg(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-F16-NEXT:    mov.b32 %r2, 0;
+; CHECK-F16-NEXT:    sub.rn.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fneg(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<6>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.f32 %f2, 0f00000000;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    sub.rn.f32 %f5, %f2, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f5;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    ret;
   %r = fsub <2 x half> <half 0.0, half 0.0>, %a
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fmul(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16x2     [[R:%r[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  mul.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG:  mul.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fmul(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fmul_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fmul_param_0];
+; CHECK-F16-NEXT:    mul.rn.f16x2 %r3, %r1, %r2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fmul(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmul_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmul_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    mul.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    mul.rn.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    ret;
   %r = fmul <2 x half> %a, %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  cvt.f32.f16     [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.f16     [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.f16     [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[FR0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_fdiv(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NEXT:    div.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = fdiv <2 x half> %a, %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_frem(
 ; -- Load two 16x2 inputs and split them into f16 elements
-; CHECK-DAG:  ld.param.b32       [[A:%r[0-9]+]], [test_frem_param_0];
-; CHECK-DAG:  ld.param.b32       [[B:%r[0-9]+]], [test_frem_param_1];
 ; -- Split into elements
-; CHECK-DAG:  mov.b32            {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32            {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
 ; -- promote to f32.
-; CHECK-DAG:  cvt.f32.f16        [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16        [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.f16        [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.f16        [[FB1:%f[0-9]+]], [[B1]];
 ; -- frem(a[0],b[0]).
-; CHECK-DAG:  div.rn.f32         [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG:  cvt.rzi.f32.f32    [[DI0:%f[0-9]+]], [[FD0]];
-; CHECK-DAG:  mul.f32            [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
-; CHECK-DAG:  sub.f32            [[RFNINF0:%f[0-9]+]], [[FA0]], [[RI0]];
-; CHECK-DAG:  testp.infinite.f32 [[ISB0INF:%p[0-9]+]], [[FB0]];
-; CHECK-DAG:  selp.f32           [[RF0:%f[0-9]+]], [[FA0]], [[RFNINF0]], [[ISB0INF]];
 ; -- frem(a[1],b[1]).
-; CHECK-DAG:  div.rn.f32         [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG:  cvt.rzi.f32.f32    [[DI1:%f[0-9]+]], [[FD1]];
-; CHECK-DAG:  mul.f32            [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
-; CHECK-DAG:  sub.f32            [[RFNINF1:%f[0-9]+]], [[FA1]], [[RI1]];
-; CHECK-DAG:  testp.infinite.f32 [[ISB1INF:%p[0-9]+]], [[FB1]];
-; CHECK-DAG:  selp.f32           [[RF1:%f[0-9]+]], [[FA1]], [[RFNINF1]], [[ISB1INF]];
 ; -- convert back to f16.
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
 ; -- merge into f16x2 and return it.
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_frem(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<3>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_frem_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_frem_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NEXT:    div.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
+; CHECK-NEXT:    mul.f32 %f5, %f4, %f1;
+; CHECK-NEXT:    sub.f32 %f6, %f2, %f5;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %f1;
+; CHECK-NEXT:    selp.f32 %f7, %f2, %f6, %p1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f7;
+; CHECK-NEXT:    cvt.f32.f16 %f8, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %f9, %rs3;
+; CHECK-NEXT:    div.rn.f32 %f10, %f9, %f8;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-NEXT:    mul.f32 %f12, %f11, %f8;
+; CHECK-NEXT:    sub.f32 %f13, %f9, %f12;
+; CHECK-NEXT:    testp.infinite.f32 %p2, %f8;
+; CHECK-NEXT:    selp.f32 %f14, %f9, %f13, %p2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f14;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = frem <2 x half> %a, %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: .func test_ldst_v2f16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
-; CHECK-DAG:    ld.b32          [[E:%r[0-9]+]], [%[[A]]]
-; CHECK-DAG:    st.b32          [%[[B]]], [[E]];
-; CHECK:        ret;
 define void @test_ldst_v2f16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v2f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2f16_param_1];
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2f16_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.b32 [%rd2], %r1;
+; CHECK-NEXT:    ret;
   %t1 = load <2 x half>, ptr %a
   store <2 x half> %t1, ptr %b, align 16
   ret void
 }
 
-; CHECK-LABEL: .func test_ldst_v3f16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
 ; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
 ;    number of bitshifting instructions that may change at llvm's whim.
 ;    So we only verify that we only issue correct number of writes using
 ;    correct offset, but not the values we write.
-; CHECK-DAG:    ld.u64
-; CHECK-DAG:    st.u32          [%[[B]]],
-; CHECK-DAG:    st.b16          [%[[B]]+4],
-; CHECK:        ret;
 define void @test_ldst_v3f16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v3f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v3f16_param_1];
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3f16_param_0];
+; CHECK-NEXT:    ld.u64 %rd3, [%rd1];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd3; }
+; CHECK-NEXT:    st.u32 [%rd2], %rd3;
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
+; CHECK-NEXT:    st.b16 [%rd2+4], %rs1;
+; CHECK-NEXT:    ret;
   %t1 = load <3 x half>, ptr %a
   store <3 x half> %t1, ptr %b, align 16
   ret void
 }
 
-; CHECK-LABEL: .func test_ldst_v4f16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
-; CHECK-DAG:    ld.v4.b16       {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]];
-; CHECK-DAG:    st.v4.b16       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK:        ret;
 define void @test_ldst_v4f16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v4f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4f16_param_1];
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4f16_param_0];
+; CHECK-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
+; CHECK-NEXT:    ret;
   %t1 = load <4 x half>, ptr %a
   store <4 x half> %t1, ptr %b, align 16
   ret void
 }
 
-; CHECK-LABEL: .func test_ldst_v8f16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
-; CHECK-DAG:    ld.v4.b32       {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
-; CHECK-DAG:    st.v4.b32       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK:        ret;
 define void @test_ldst_v8f16(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v8f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8f16_param_1];
+; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8f16_param_0];
+; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    ret;
   %t1 = load <8 x half>, ptr %a
   store <8 x half> %t1, ptr %b, align 16
   ret void
@@ -327,704 +461,1210 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) {
 
 declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
 
-; CHECK-LABEL: test_call(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_param_1];
-; CHECK:      {
-; CHECK-DAG:  .param .align 4 .b8 param0[4];
-; CHECK-DAG:  .param .align 4 .b8 param1[4];
-; CHECK-DAG:  st.param.b32    [param0], [[A]];
-; CHECK-DAG:  st.param.b32    [param1], [[B]];
-; CHECK-DAG:  .param .align 4 .b8 retval0[4];
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT:        test_callee,
-; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_call(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.b32 [param0], %r1;
+; CHECK-NEXT:    .param .align 4 .b8 param1[4];
+; CHECK-NEXT:    st.param.b32 [param1], %r2;
+; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    test_callee,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_call_flipped(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_flipped_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_flipped_param_1];
-; CHECK:      {
-; CHECK-DAG:  .param .align 4 .b8 param0[4];
-; CHECK-DAG:  .param .align 4 .b8 param1[4];
-; CHECK-DAG:  st.param.b32    [param0], [[B]];
-; CHECK-DAG:  st.param.b32    [param1], [[A]];
-; CHECK-DAG:  .param .align 4 .b8 retval0[4];
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT:        test_callee,
-; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_call_flipped(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_call_flipped_param_0];
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.b32 [param0], %r2;
+; CHECK-NEXT:    .param .align 4 .b8 param1[4];
+; CHECK-NEXT:    st.param.b32 [param1], %r1;
+; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    test_callee,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_tailcall_flipped_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_tailcall_flipped_param_1];
-; CHECK:      {
-; CHECK-DAG:  .param .align 4 .b8 param0[4];
-; CHECK-DAG:  .param .align 4 .b8 param1[4];
-; CHECK-DAG:  st.param.b32    [param0], [[B]];
-; CHECK-DAG:  st.param.b32    [param1], [[A]];
-; CHECK-DAG:  .param .align 4 .b8 retval0[4];
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT:        test_callee,
-; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tailcall_flipped_param_0];
+; CHECK-NEXT:    { // callseq 2, 0
+; CHECK-NEXT:    .param .align 4 .b8 param0[4];
+; CHECK-NEXT:    st.param.b32 [param0], %r2;
+; CHECK-NEXT:    .param .align 4 .b8 param1[4];
+; CHECK-NEXT:    st.param.b32 [param1], %r1;
+; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    test_callee,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    );
+; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
+; CHECK-NEXT:    } // callseq 2
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_select(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_param_1];
-; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
-; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_0];
+; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = select i1 %c, <2 x half> %a, <2 x half> %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_select_cc(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_param_3];
-;
-; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-;
-; CHECK-NOF16-DAG: mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
-; CHECK-NOF16-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; CHECK-NOF16-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
+; CHECK-F16-LABEL: test_select_cc(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-F16-NEXT:    .reg .b32 %r<6>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r3, %r4;
+; CHECK-F16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-F16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-F16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; CHECK-F16-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
+; CHECK-F16-NEXT:    mov.b32 %r5, {%rs6, %rs5};
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_select_cc(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<11>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<6>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
+; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
+; CHECK-NOF16-NEXT:    mov.b32 %r5, {%rs10, %rs9};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NOF16-NEXT:    ret;
   %cc = fcmp une <2 x half> %c, %d
   %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_select_cc_f32_f16(
-; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
-; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f32_f16_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f32_f16_param_3];
-;
-; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
-; CHECK-NOF16-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; CHECK-NOF16-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG: selp.f32        [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.f32        [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK-NEXT: st.param.v2.f32    [func_retval0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
 define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
+; CHECK-F16-LABEL: test_select_cc_f32_f16(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-NEXT:    .reg .f32 %f<7>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
+; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
+; CHECK-F16-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-F16-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<11>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f6, %f5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f8, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f8, %f7;
+; CHECK-NOF16-NEXT:    selp.f32 %f9, %f2, %f4, %p2;
+; CHECK-NOF16-NEXT:    selp.f32 %f10, %f1, %f3, %p1;
+; CHECK-NOF16-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
+; CHECK-NOF16-NEXT:    ret;
                                            <2 x half> %c, <2 x half> %d) #0 {
   %cc = fcmp une <2 x half> %c, %d
   %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
   ret <2 x float> %r
 }
 
-; CHECK-LABEL: test_select_cc_f16_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_f16_f32_param_1];
-; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
-; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
-; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
-; CHECK-NEXT: ret;
 define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<3>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
+; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
+; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
   %cc = fcmp une <2 x float> %c, %d
   %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fcmp_une(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_une_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_une_param_1];
-; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.neu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.neu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8  [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8  [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_une(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_une_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_une_param_0];
+; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_une(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_une_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_une_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp une <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ueq(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ueq_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ueq_param_1];
-; CHECK-F16:  setp.equ.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.equ.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.equ.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ueq(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ueq_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ueq_param_0];
+; CHECK-F16-NEXT:    setp.equ.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ueq(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ueq_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ueq_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.equ.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.equ.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ueq <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ugt(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ugt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ugt_param_1];
-; CHECK-F16:  setp.gtu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.gtu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.gtu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ugt(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ugt_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ugt_param_0];
+; CHECK-F16-NEXT:    setp.gtu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ugt(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ugt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ugt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.gtu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.gtu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ugt <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_uge(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_uge_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_uge_param_1];
-; CHECK-F16:  setp.geu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.geu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.geu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_uge(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_uge_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_uge_param_0];
+; CHECK-F16-NEXT:    setp.geu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_uge(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uge_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uge_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.geu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.geu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp uge <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ult(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ult_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ult_param_1];
-; CHECK-F16:  setp.ltu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.ltu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.ltu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ult(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ult_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ult_param_0];
+; CHECK-F16-NEXT:    setp.ltu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ult(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ult_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ult_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.ltu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.ltu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ult <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ule(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ule_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ule_param_1];
-; CHECK-F16:  setp.leu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.leu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.leu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ule(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ule_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ule_param_0];
+; CHECK-F16-NEXT:    setp.leu.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ule(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ule_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ule_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.leu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.leu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ule <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
 
-; CHECK-LABEL: test_fcmp_uno(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_uno_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_uno_param_1];
-; CHECK-F16:  setp.nan.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.nan.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.nan.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_uno(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_uno_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_uno_param_0];
+; CHECK-F16-NEXT:    setp.nan.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_uno(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uno_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uno_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp uno <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_one(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_one_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_one_param_1];
-; CHECK-F16:  setp.ne.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.ne.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.ne.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_one(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_one_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_one_param_0];
+; CHECK-F16-NEXT:    setp.ne.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_one(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_one_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_one_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.ne.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.ne.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp one <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_oeq(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_oeq_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_oeq_param_1];
-; CHECK-F16:  setp.eq.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.eq.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.eq.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_oeq(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_oeq_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_oeq_param_0];
+; CHECK-F16-NEXT:    setp.eq.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_oeq(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oeq_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oeq_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp oeq <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ogt(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ogt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ogt_param_1];
-; CHECK-F16:  setp.gt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.gt.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.gt.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ogt(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ogt_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ogt_param_0];
+; CHECK-F16-NEXT:    setp.gt.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ogt(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ogt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ogt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.gt.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ogt <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_oge(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_oge_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_oge_param_1];
-; CHECK-F16:  setp.ge.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.ge.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.ge.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_oge(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_oge_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_oge_param_0];
+; CHECK-F16-NEXT:    setp.ge.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_oge(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oge_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oge_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.ge.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.ge.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp oge <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_olt(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_olt_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_olt_param_1];
-; CHECK-F16:  setp.lt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.lt.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.lt.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_olt(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_olt_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_olt_param_0];
+; CHECK-F16-NEXT:    setp.lt.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_olt(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_olt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_olt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.lt.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp olt <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; XCHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ole_param_1];
-; CHECK-F16:  setp.le.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.le.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.le.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ole(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ole_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ole_param_0];
+; CHECK-F16-NEXT:    setp.le.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ole(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ole_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ole_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.le.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.le.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ole <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fcmp_ord_param_1];
-; CHECK-F16:  setp.num.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  setp.num.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG:  setp.num.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-NEXT: st.param.b8     [func_retval0], [[R0]];
-; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.b8     [func_retval0+1], [[R1]];
-; CHECK-NEXT: ret;
 define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_fcmp_ord(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .pred %p<3>;
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fcmp_ord_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fcmp_ord_param_0];
+; CHECK-F16-NEXT:    setp.num.f16x2 %p1|%p2, %r1, %r2;
+; CHECK-F16-NEXT:    selp.u16 %rs1, -1, 0, %p1;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0], %rs1;
+; CHECK-F16-NEXT:    selp.u16 %rs2, -1, 0, %p2;
+; CHECK-F16-NEXT:    st.param.b8 [func_retval0+1], %rs2;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fcmp_ord(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .pred %p<3>;
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ord_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ord_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    setp.num.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs3;
+; CHECK-NOF16-NEXT:    setp.num.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT:    selp.u16 %rs5, -1, 0, %p2;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
+; CHECK-NOF16-NEXT:    selp.u16 %rs6, -1, 0, %p1;
+; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0+1], %rs6;
+; CHECK-NOF16-NEXT:    ret;
   %r = fcmp ord <2 x half> %a, %b
   ret <2 x i1> %r
 }
 
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK:      st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}
-; CHECK:      ret;
 define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptosi_i32_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rzi.s32.f16 %r2, %rs2;
+; CHECK-NEXT:    cvt.rzi.s32.f16 %r3, %rs1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT:    ret;
   %r = fptosi <2 x half> %a to <2 x i32>
   ret <2 x i32> %r
 }
 
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK:      st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}
-; CHECK:      ret;
 define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptosi_i64_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rzi.s64.f16 %rd1, %rs2;
+; CHECK-NEXT:    cvt.rzi.s64.f16 %rd2, %rs1;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NEXT:    ret;
   %r = fptosi <2 x half> %a to <2 x i64>
   ret <2 x i64> %r
 }
 
-; CHECK-LABEL: test_fptoui_2xi32(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptoui_2xi32_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK:      st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}
-; CHECK:      ret;
 define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rzi.u32.f16 %r2, %rs2;
+; CHECK-NEXT:    cvt.rzi.u32.f16 %r3, %rs1;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT:    ret;
   %r = fptoui <2 x half> %a to <2 x i32>
   ret <2 x i32> %r
 }
 
-; CHECK-LABEL: test_fptoui_2xi64(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fptoui_2xi64_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK:      st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]}
-; CHECK:      ret;
 define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rzi.u64.f16 %rd1, %rs2;
+; CHECK-NEXT:    cvt.rzi.u64.f16 %rd2, %rs1;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NEXT:    ret;
   %r = fptoui <2 x half> %a to <2 x i64>
   ret <2 x i64> %r
 }
 
-; CHECK-LABEL: test_uitofp_2xi32(
-; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
-; CHECK-DAG:  cvt.rn.f16.u32  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u32  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-NEXT:    cvt.rn.f16.u32 %rs1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.u32 %rs2, %r1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = uitofp <2 x i32> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_uitofp_2xi64(
-; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f16.u64  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u64  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-NEXT:    cvt.rn.f16.u64 %rs1, %rd2;
+; CHECK-NEXT:    cvt.rn.f16.u64 %rs2, %rd1;
+; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = uitofp <2 x i64> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_sitofp_2xi32(
-; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
-; CHECK-DAG:  cvt.rn.f16.s32  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s32  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-NEXT:    cvt.rn.f16.s32 %rs1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.s32 %rs2, %r1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = sitofp <2 x i32> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_sitofp_2xi64(
-; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
-; CHECK-DAG:  cvt.rn.f16.s64  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s64  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-NEXT:    cvt.rn.f16.s64 %rs1, %rd2;
+; CHECK-NEXT:    cvt.rn.f16.s64 %rs2, %rd1;
+; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = sitofp <2 x i64> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_uitofp_2xi32_fadd(
-; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
-; CHECK-DAG:  cvt.rn.f16.u32  [[C0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.u32  [[C1:%rs[0-9]+]], [[A1]];
 
-; CHECK-F16-DAG:  mov.b32         [[C:%r[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG:  add.rn.f16x2    [[R:%r[0-9]+]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<6>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-F16-NEXT:    cvt.rn.f16.u32 %rs1, %r2;
+; CHECK-F16-NEXT:    cvt.rn.f16.u32 %rs2, %r1;
+; CHECK-F16-NEXT:    mov.b32 %r4, {%rs2, %rs1};
+; CHECK-F16-NEXT:    add.rn.f16x2 %r5, %r3, %r4;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs1, %r1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs2, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    ret;
   %c = uitofp <2 x i32> %a to <2 x half>
   %r = fadd <2 x half> %b, %c
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_sitofp_2xi32_fadd(
-; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
-; CHECK-DAG:  cvt.rn.f16.s32  [[C0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.s32  [[C1:%rs[0-9]+]], [[A1]];
-;
-; CHECK-F16-DAG:  mov.b32         [[C:%r[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG:  add.rn.f16x2    [[R:%r[0-9]+]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<6>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
+; CHECK-F16-NEXT:    cvt.rn.f16.s32 %rs1, %r2;
+; CHECK-F16-NEXT:    cvt.rn.f16.s32 %rs2, %r1;
+; CHECK-F16-NEXT:    mov.b32 %r4, {%rs2, %rs1};
+; CHECK-F16-NEXT:    add.rn.f16x2 %r5, %r3, %r4;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
+; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs1, %r1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs2, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs6, %rs5};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    ret;
   %c = sitofp <2 x i32> %a to <2 x half>
   %r = fadd <2 x half> %b, %c
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fptrunc_2xfloat(
-; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
+; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = fptrunc <2 x float> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fptrunc_2xdouble(
-; CHECK:      ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
-; CHECK-DAG:  cvt.rn.f16.f64  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.f16.f64  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
+; CHECK-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
+; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = fptrunc <2 x double> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fpext_2xfloat(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[R0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[R1:%f[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0], {[[R0]], [[R1]]};
-; CHECK:      ret;
 define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x float>
   ret <2 x float> %r
 }
 
-; CHECK-LABEL: test_fpext_2xdouble(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xdouble_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f64.f16     [[R0:%fd[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f64.f16     [[R1:%fd[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f64 [func_retval0], {[[R0]], [[R1]]};
-; CHECK:      ret;
 define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f64.f16 %fd1, %rs2;
+; CHECK-NEXT:    cvt.f64.f16 %fd2, %rs1;
+; CHECK-NEXT:    st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x double>
   ret <2 x double> %r
 }
 
 
-; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
-; CHECK:      st.param.b32 [func_retval0], [[A]]
-; CHECK:      ret;
 define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r2, [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = bitcast <2 x half> %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
-; CHECK:      ld.param.u32         [[R:%r[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0];
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r2, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = bitcast <2 x i16> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_bitcast_float_to_2xhalf(
-; CHECK: ld.param.f32 	[[AF1:%f[0-9]+]], [test_bitcast_float_to_2xhalf_param_0];
-; CHECK: mov.b32 	[[R:%r[0-9]+]], [[AF1]];
-; CHECK: st.param.b32 	[func_retval0], [[R]];
-; CHECK: ret;
 define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
+; CHECK-LABEL: test_bitcast_float_to_2xhalf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [test_bitcast_float_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %r = bitcast float %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_bitcast_2xhalf_to_float(
-; CHECK: ld.param.u32 	[[R:%r[0-9]+]], [test_bitcast_2xhalf_to_float_param_0];
-; CHECK: mov.b32 	[[AF1:%f[0-9]+]], [[R]];
-; CHECK: st.param.f32 	[func_retval0], [[AF1]];
-; CHECK: ret;
 define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xhalf_to_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r2, [test_bitcast_2xhalf_to_float_param_0];
+; CHECK-NEXT:    mov.b32 %f1, %r2;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    ret;
   %r = bitcast <2 x half> %a to float
   ret float %r
 }
@@ -1053,19 +1693,25 @@ declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
 declare <2 x half> @llvm.roundeven.f16(<2 x half> %a) #0
 declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 
-; CHECK-LABEL: test_sqrt(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sqrt_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_sqrt(<2 x half> %a) #0 {
+; CHECK-LABEL: test_sqrt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT:    sqrt.rn.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
   ret <2 x half> %r
 }
@@ -1077,36 +1723,48 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ;  ret <2 x half> %r
 ;}
 
-; CHECK-LABEL: test_sin(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sin_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+; CHECK-LABEL: test_sin(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sin_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    sin.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT:    sin.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_cos(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_cos_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+; CHECK-LABEL: test_cos(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_cos_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cos.approx.f32 %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NEXT:    cos.approx.f32 %f4, %f3;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
   ret <2 x half> %r
 }
@@ -1153,355 +1811,579 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ;  ret <2 x half> %r
 ;}
 
-; CHECK-LABEL: test_fma(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fma_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fma_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fma_param_2];
-;
-; CHECK-F16:        fma.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret
+
 define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+; CHECK-F16-LABEL: test_fma(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_fma_param_2];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fma_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fma_param_0];
+; CHECK-F16-NEXT:    fma.rn.f16x2 %r4, %r1, %r2, %r3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fma(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<9>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fma_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fma_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fma_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %f4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK-NOF16:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-NOF16-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-NOF16-DAG:  abs.f32         [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-NOF16-DAG:  abs.f32         [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK-NOF16:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-F16:        and.b32         [[R:%r[0-9]+]], [[A]], 2147450879;
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_fabs(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fabs(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-F16-NEXT:    and.b32 %r3, %r1, 2147450879;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fabs(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %f2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs1;
+; CHECK-NOF16-NEXT:    abs.f32 %f4, %f3;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs4, %f4;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG:  min.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG:  min.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_minnum(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NEXT:    min.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NEXT:    min.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG:  max.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG:  max.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-LABEL: test_maxnum(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .f32 %f<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NEXT:    max.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %f4, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NEXT:    max.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_param_1];
-; CHECK-NOF16-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[A0]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[A1]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[B0]], -32768;
-; CHECK-NOF16-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[B1]], -32768;
-; CHECK-NOF16-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-NOF16-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-NOF16-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-F16-DAG:    and.b32         [[R0:%r[0-9]+]], [[B]], -2147450880;
-; CHECK-F16-DAG:    and.b32         [[R1:%r[0-9]+]], [[A]], 2147450879;
-; CHECK-F16-DAG:    or.b32          [[R:%r[0-9]+]], [[R1]], [[R0]]
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_copysign(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<9>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
+; CHECK-F16-NEXT:    and.b32 %r4, %r2, -2147450880;
+; CHECK-F16-NEXT:    and.b32 %r6, %r1, 2147450879;
+; CHECK-F16-NEXT:    or.b32 %r7, %r6, %r4;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_copysign(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    and.b16 %rs4, %rs2, -32768;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs6, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs9, %rs8, %rs4;
+; CHECK-NOF16-NEXT:    and.b16 %rs12, %rs1, -32768;
+; CHECK-NOF16-NEXT:    and.b16 %rs14, %rs5, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs15, %rs14, %rs12;
+; CHECK-NOF16-NEXT:    mov.b32 %r3, {%rs15, %rs9};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG:  ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
-; CHECK-NOF16-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32         [[BI0:%r[0-9]+]], [[B0]];
-; CHECK-NOF16-DAG:  mov.b32         [[BI1:%r[0-9]+]], [[B1]];
-; CHECK-NOF16-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[A0]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[A1]], 32767;
-; CHECK-NOF16-DAG:  and.b32         [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
-; CHECK-NOF16-DAG:  and.b32         [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
-; CHECK-NOF16-DAG:  mov.b32         {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; }
-; CHECK-NOF16-DAG:  mov.b32         {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; }
-; CHECK-NOF16-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-NOF16-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-NOF16-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-F16-DAG:    cvt.rn.f16.f32  [[R0:%rs[0-9]+]], [[B1]];
-; CHECK-F16-DAG:    cvt.rn.f16.f32  [[R1:%rs[0-9]+]], [[B0]];
-; CHECK-F16-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[R1]], [[R0]]};
-; CHECK-F16-DAG:    and.b32         [[R3:%r[0-9]+]], [[R2]], -2147450880;
-; CHECK-F16-DAG:    and.b32         [[R4:%r[0-9]+]], [[A]], 2147450879;
-; CHECK-F16-DAG:    or.b32          [[R:%r[0-9]+]], [[R4]], [[R3]]
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
+; CHECK-F16-LABEL: test_copysign_f32(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<9>;
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
+; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
+; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
+; CHECK-F16-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; CHECK-F16-NEXT:    and.b32 %r4, %r2, -2147450880;
+; CHECK-F16-NEXT:    and.b32 %r6, %r1, 2147450879;
+; CHECK-F16-NEXT:    or.b32 %r7, %r6, %r4;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_copysign_f32(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<13>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT:    and.b32 %r3, %r2, -2147483648;
+; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r3; }
+; CHECK-NOF16-NEXT:    mov.b32 {%rs2, %rs3}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs5, %rs3, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs6, %rs5, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, %f1;
+; CHECK-NOF16-NEXT:    and.b32 %r5, %r4, -2147483648;
+; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
+; CHECK-NOF16-NEXT:    and.b16 %rs10, %rs2, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs11, %rs10, %rs8;
+; CHECK-NOF16-NEXT:    mov.b32 %r6, {%rs11, %rs6};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NOF16-NEXT:    ret;
   %tb = fptrunc <2 x float> %b to <2 x half>
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG:  ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
-; CHECK-NOF16-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b64         [[BI0:%rd[0-9]+]], [[B0]];
-; CHECK-NOF16-DAG:  mov.b64         [[BI1:%rd[0-9]+]], [[B1]];
-; CHECK-NOF16-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[A0]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[A1]], 32767;
-; CHECK-NOF16-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
-; CHECK-NOF16-DAG:  and.b64         [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
-; CHECK-NOF16-DAG:  shr.u64         [[BY0:%rd[0-9]+]], [[BX0]], 48;
-; CHECK-NOF16-DAG:  shr.u64         [[BY1:%rd[0-9]+]], [[BX1]], 48;
-; CHECK-NOF16-DAG:  cvt.u16.u64     [[BZ0:%rs[0-9]+]], [[BY0]];
-; CHECK-NOF16-DAG:  cvt.u16.u64     [[BZ1:%rs[0-9]+]], [[BY1]];
-; CHECK-NOF16-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-NOF16-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-NOF16-DAG:  mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-F16-DAG:      cvt.rn.f16.f64  [[R0:%rs[0-9]+]], [[B1]];
-; CHECK-F16-DAG:      cvt.rn.f16.f64  [[R1:%rs[0-9]+]], [[B0]];
-; CHECK-F16-DAG:      mov.b32         [[R2:%r[0-9]+]], {[[R1]], [[R0]]};
-; CHECK-F16-DAG:      and.b32         [[R3:%r[0-9]+]], [[R2]], -2147450880;
-; CHECK-F16-DAG:      and.b32         [[R4:%r[0-9]+]], [[A]], 2147450879;
-; CHECK-F16-DAG:      or.b32          [[R:%r[0-9]+]], [[R4]], [[R3]];
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
+; CHECK-F16-LABEL: test_copysign_f64(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<9>;
+; CHECK-F16-NEXT:    .reg .f64 %fd<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
+; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
+; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
+; CHECK-F16-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; CHECK-F16-NEXT:    and.b32 %r4, %r2, -2147450880;
+; CHECK-F16-NEXT:    and.b32 %r6, %r1, 2147450879;
+; CHECK-F16-NEXT:    or.b32 %r7, %r6, %r4;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_copysign_f64(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<13>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NOF16-NEXT:    .reg .f64 %fd<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs4, %rs2, 32767;
+; CHECK-NOF16-NEXT:    mov.b64 %rd1, %fd2;
+; CHECK-NOF16-NEXT:    and.b64 %rd2, %rd1, -9223372036854775808;
+; CHECK-NOF16-NEXT:    shr.u64 %rd3, %rd2, 48;
+; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs5, %rd3;
+; CHECK-NOF16-NEXT:    or.b16 %rs6, %rs4, %rs5;
+; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs1, 32767;
+; CHECK-NOF16-NEXT:    mov.b64 %rd4, %fd1;
+; CHECK-NOF16-NEXT:    and.b64 %rd5, %rd4, -9223372036854775808;
+; CHECK-NOF16-NEXT:    shr.u64 %rd6, %rd5, 48;
+; CHECK-NOF16-NEXT:    cvt.u16.u64 %rs10, %rd6;
+; CHECK-NOF16-NEXT:    or.b16 %rs11, %rs9, %rs10;
+; CHECK-NOF16-NEXT:    mov.b32 %r2, {%rs11, %rs6};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NOF16-NEXT:    ret;
   %tb = fptrunc <2 x double> %b to <2 x half>
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-NOF16-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[A0]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[A1]], 32767;
-; CHECK-NOF16-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[B0]], -32768;
-; CHECK-NOF16-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[B1]], -32768;
-; CHECK-NOF16-DAG:  or.b16          [[R0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-NOF16-DAG:  or.b16          [[R1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-NOF16-DAG:  cvt.f32.f16     [[XR0:%f[0-9]+]], [[R0]];
-; CHECK-NOF16-DAG:  cvt.f32.f16     [[XR1:%f[0-9]+]], [[R1]];
-; CHECK-F16-DAG:    and.b32         [[R0:%r[0-9]+]], [[B]], -2147450880;
-; CHECK-F16-DAG:    and.b32         [[R1:%r[0-9]+]], [[A]], 2147450879;
-; CHECK-F16-DAG:    or.b32          [[R2:%r[0-9]+]], [[R1]], [[R0]]
-; CHECK-F16-DAG:    mov.b32         {[[R3:%rs[0-9]+]], [[R4:%rs[0-9]+]]}, [[R2]]
-; CHECK-F16-DAG:    cvt.f32.f16     [[XR0:%f[0-9]+]], [[R3]]
-; CHECK-F16-DAG:    cvt.f32.f16     [[XR1:%f[0-9]+]], [[R4]]
-; CHECK:      st.param.v2.f32 [func_retval0], {[[XR0]], [[XR1]]};
-; CHECK:      ret;
 define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
+; CHECK-F16-LABEL: test_copysign_extended(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-NEXT:    .reg .b32 %r<9>;
+; CHECK-F16-NEXT:    .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_copysign_extended_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_extended_param_0];
+; CHECK-F16-NEXT:    and.b32 %r4, %r2, -2147450880;
+; CHECK-F16-NEXT:    and.b32 %r6, %r1, 2147450879;
+; CHECK-F16-NEXT:    or.b32 %r7, %r6, %r4;
+; CHECK-F16-NEXT:    mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-F16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-F16-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-F16-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_copysign_extended(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_copysign_extended_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_extended_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    and.b16 %rs4, %rs1, -32768;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs5, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs9, %rs8, %rs4;
+; CHECK-NOF16-NEXT:    and.b16 %rs12, %rs2, -32768;
+; CHECK-NOF16-NEXT:    and.b16 %rs14, %rs6, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs15, %rs14, %rs12;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs15;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs9;
+; CHECK-NOF16-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
   %xr = fpext <2 x half> %r to <2 x float>
   ret <2 x float> %xr
 }
 
-; CHECK-LABEL: test_floor(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_floor_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rmi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rmi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_floor(<2 x half> %a) #0 {
+; CHECK-LABEL: test_floor(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rmi.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rmi.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_ceil(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_ceil_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rpi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rpi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_ceil(<2 x half> %a) #0 {
+; CHECK-LABEL: test_ceil(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rpi.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rpi.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_trunc(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_trunc_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rzi.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rzi.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_trunc(<2 x half> %a) #0 {
+; CHECK-LABEL: test_trunc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rzi.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rzi.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_rint(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_rint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_rint(<2 x half> %a) #0 {
+; CHECK-LABEL: test_rint(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_nearbyint(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_nearbyint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
+; CHECK-LABEL: test_nearbyint(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_nearbyint_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_roundeven(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_roundeven_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_roundeven(<2 x half> %a) #0 {
+; CHECK-LABEL: test_roundeven(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_roundeven_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.roundeven.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_round(
-; CHECK:      ld.param.b32    {{.*}}, [test_round_param_0];
 ; check the use of sign mask and 0.5 to implement round
-; CHECK:      and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R1]], 1056964608;
-; CHECK:      and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R2]], 1056964608;
-; CHECK:      st.param.b32    [func_retval0], {{.*}};
-; CHECK:      ret;
 define <2 x half> @test_round(<2 x half> %a) #0 {
+; CHECK-LABEL: test_round(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<5>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .f32 %f<17>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    mov.b32 %r2, %f1;
+; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
+; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
+; CHECK-NEXT:    mov.b32 %f2, %r4;
+; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
+; CHECK-NEXT:    abs.f32 %f5, %f1;
+; CHECK-NEXT:    setp.gt.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
+; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f8;
+; CHECK-NEXT:    cvt.f32.f16 %f9, %rs1;
+; CHECK-NEXT:    mov.b32 %r5, %f9;
+; CHECK-NEXT:    and.b32 %r6, %r5, -2147483648;
+; CHECK-NEXT:    or.b32 %r7, %r6, 1056964608;
+; CHECK-NEXT:    mov.b32 %f10, %r7;
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, %f10;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f12, %f11;
+; CHECK-NEXT:    abs.f32 %f13, %f9;
+; CHECK-NEXT:    setp.gt.f32 %p3, %f13, 0f4B000000;
+; CHECK-NEXT:    selp.f32 %f14, %f9, %f12, %p3;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %f15, %f9;
+; CHECK-NEXT:    setp.lt.f32 %p4, %f13, 0f3F000000;
+; CHECK-NEXT:    selp.f32 %f16, %f15, %f14, %p4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs4, %f16;
+; CHECK-NEXT:    mov.b32 %r8, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fmuladd_param_2];
-;
-; CHECK-F16:        fma.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG:  mov.b32        {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG:  mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; CHECK-NOF16:      mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK:      st.param.b32    [func_retval0], [[R]];
-; CHECK:      ret;
 define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+; CHECK-F16-LABEL: test_fmuladd(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_2];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_0];
+; CHECK-F16-NEXT:    fma.rn.f16x2 %r4, %r1, %r2, %r3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-NOF16-LABEL: test_fmuladd(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<9>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %f4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f6, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f7, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f32 %f8, %f7, %f6, %f5;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %f8;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_shufflevector(
-; CHECK: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK: mov.b32 %r2, {%rs2, %rs1};
 define <2 x half> @test_shufflevector(<2 x half> %a) #0 {
+; CHECK-LABEL: test_shufflevector(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_shufflevector_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   ret <2 x half> %s
 }
 
-; CHECK-LABEL: test_insertelement(
-; CHECK: mov.b32 {%rs2, tmp}, %r1;
-; CHECK: mov.b32 %r2, {%rs2, %rs1};
 define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 {
+; CHECK-LABEL: test_insertelement(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
+; CHECK-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %i = insertelement <2 x half> %a, half %x, i64 1
   ret <2 x half> %i
 }
 
-; CHECK-LABEL: test_sitofp_2xi16_to_2xhalf(
-; CHECK:      cvt.rn.f16.s16
-; CHECK:      cvt.rn.f16.s16
-; CHECK:      ret;
 define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_sitofp_2xi16_to_2xhalf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rn.f16.s16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rn.f16.s16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = sitofp <2 x i16> %a to <2 x half>
   ret <2 x half> %r
 }
 
-; CHECK-LABEL: test_uitofp_2xi16_to_2xhalf(
-; CHECK:      cvt.rn.f16.u16
-; CHECK:      cvt.rn.f16.u16
-; CHECK:      ret;
 define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_uitofp_2xi16_to_2xhalf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.rn.f16.u16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rn.f16.u16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = uitofp <2 x i16> %a to <2 x half>
   ret <2 x half> %r
 }
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 988438bebea6d..388bd314801fc 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -1,262 +1,381 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; ## Support i16x2 instructions
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80        \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s
 ; RUN: %if ptxas %{                                                           \
-; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90                    \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN:   | %ptxas-verify -arch=sm_90                                          \
 ; RUN: %}
 ; ## No support for i16x2 instructions
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53                      \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s
 ; RUN: %if ptxas %{                                                           \
-; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53                    \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN:   | %ptxas-verify -arch=sm_53                                          \
 ; RUN: %}
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
-; COMMON-LABEL: test_ret_const(
-; COMMON:     mov.b32         [[R:%r[0-9+]]], 131073;
-; COMMON:     st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_ret_const() #0 {
+; COMMON-LABEL: test_ret_const(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<2>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    mov.b32 %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
+; COMMON-NEXT:    ret;
   ret <2 x i16> <i16 1, i16 2>
 }
 
-; COMMON-LABEL: test_extract_0(
-; COMMON:      ld.param.u32   [[A:%r[0-9]+]], [test_extract_0_param_0];
-; COMMON:      mov.b32        {[[RS:%rs[0-9]+]], tmp}, [[A]];
-; COMMON:      cvt.u32.u16    [[R:%r[0-9]+]], [[RS]];
-; COMMON:      st.param.b32    [func_retval0], [[R]];
-; COMMON:      ret;
 define i16 @test_extract_0(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_extract_0(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<2>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
+; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
+; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %e = extractelement <2 x i16> %a, i32 0
   ret i16 %e
 }
 
-; COMMON-LABEL: test_extract_1(
-; COMMON:      ld.param.u32   [[A:%r[0-9]+]], [test_extract_1_param_0];
-; COMMON:      mov.b32        {tmp, [[RS:%rs[0-9]+]]}, [[A]];
-; COMMON:      cvt.u32.u16    [[R:%r[0-9]+]], [[RS]];
-; COMMON:      st.param.b32    [func_retval0], [[R]];
-; COMMON:      ret;
 define i16 @test_extract_1(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_extract_1(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<2>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
+; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %e = extractelement <2 x i16> %a, i32 1
   ret i16 %e
 }
 
-; COMMON-LABEL: test_extract_i(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_extract_i_param_0];
-; COMMON-DAG:  ld.param.u64    [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
-; COMMON-DAG:  setp.eq.s64     [[PRED:%p[0-9]+]], [[IDX]], 0;
-; COMMON-DAG:  mov.b32         {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]];
-; COMMON:      selp.b16        [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]];
-; COMMON:      cvt.u32.u16     [[R:%r[0-9]+]], [[RS]];
-; COMMON:      st.param.b32    [func_retval0], [[R]];
-; COMMON:      ret;
 define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
+; COMMON-LABEL: test_extract_i(
+; COMMON:       {
+; COMMON-NEXT:    .reg .pred %p<2>;
+; COMMON-NEXT:    .reg .b16 %rs<4>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-NEXT:    .reg .b64 %rd<2>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
+; COMMON-NEXT:    setp.eq.s64 %p1, %rd1, 0;
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
+; COMMON-NEXT:    cvt.u32.u16 %r2, %rs3;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %e = extractelement <2 x i16> %a, i64 %idx
   ret i16 %e
 }
 
-; COMMON-LABEL: test_add(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_add_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_add_param_1];
-;
-; I16x2-NEXT:  add.s16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-; NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-; NO-I16x2-DAG: mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-; NO-I16x2-DAG: add.s16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-; NO-I16x2-DAG: add.s16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-; NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
+; I16x2-LABEL: test_add(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
+; I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
+; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_add(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
+; NO-I16x2-NEXT:    .reg .b32 %r<4>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    add.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    add.s16 %rs6, %rs3, %rs1;
+; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; NO-I16x2-NEXT:    ret;
   %r = add <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
 ; Check that we can lower add with immediate arguments.
-; COMMON-LABEL: test_add_imm_0(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_add_imm_0_param_0];
-;
-; I16x2:        mov.b32        [[I:%r[0-9+]]], 131073;
-; I16x2:        add.s16x2      [[R:%r[0-9]+]], [[A]], [[I]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: add.s16 	[[RS2:%rs[0-9]+]], [[RS0]], 1;
-;	NO-I16x2-DAG: add.s16 	[[RS3:%rs[0-9]+]], [[RS1]], 2;
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS2]], [[RS3]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
+; I16x2-LABEL: test_add_imm_0(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
+; I16x2-NEXT:    mov.b32 %r2, 131073;
+; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_add_imm_0(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
+; NO-I16x2-NEXT:    .reg .b32 %r<3>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
+; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
+; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; NO-I16x2-NEXT:    ret;
   %r = add <2 x i16> <i16 1, i16 2>, %a
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_add_imm_1(
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_add_imm_1_param_0];
-;
-; I16x2:        mov.b32        [[I:%r[0-9+]]], 131073;
-; I16x2:        add.s16x2      [[R:%r[0-9]+]], [[A]], [[I]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: add.s16 	[[RS2:%rs[0-9]+]], [[RS0]], 1;
-;	NO-I16x2-DAG: add.s16 	[[RS3:%rs[0-9]+]], [[RS1]], 2;
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS2]], [[RS3]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
+; I16x2-LABEL: test_add_imm_1(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
+; I16x2-NEXT:    mov.b32 %r2, 131073;
+; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_add_imm_1(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
+; NO-I16x2-NEXT:    .reg .b32 %r<3>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
+; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
+; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; NO-I16x2-NEXT:    ret;
   %r = add <2 x i16> %a, <i16 1, i16 2>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_sub(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_sub_param_0];
-;
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_sub_param_1];
-;
-; COMMON-DAG:  mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-; COMMON-DAG:  mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-; COMMON-DAG:  sub.s16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-; COMMON-DAG:  sub.s16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-; COMMON-DAG:  mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_sub(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<7>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    sub.s16 %rs5, %rs4, %rs2;
+; COMMON-NEXT:    sub.s16 %rs6, %rs3, %rs1;
+; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = sub <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_smax(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_smax_param_0];
-;
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_smax_param_1];
-; I16x2:   max.s16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-;	NO-I16x2-DAG: max.s16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-;	NO-I16x2-DAG: max.s16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
+; I16x2-LABEL: test_smax(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
+; I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
+; I16x2-NEXT:    max.s16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_smax(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
+; NO-I16x2-NEXT:    .reg .b32 %r<4>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    max.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    max.s16 %rs6, %rs3, %rs1;
+; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; NO-I16x2-NEXT:    ret;
   %cmp = icmp sgt <2 x i16> %a, %b
   %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_umax(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_umax_param_0];
-;
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_umax_param_1];
-; I16x2:   max.u16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-;	NO-I16x2-DAG: max.u16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-;	NO-I16x2-DAG: max.u16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
+; I16x2-LABEL: test_umax(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
+; I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
+; I16x2-NEXT:    max.u16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_umax(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
+; NO-I16x2-NEXT:    .reg .b32 %r<4>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    max.u16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    max.u16 %rs6, %rs3, %rs1;
+; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; NO-I16x2-NEXT:    ret;
   %cmp = icmp ugt <2 x i16> %a, %b
   %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_smin(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_smin_param_0];
-;
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_smin_param_1];
-; I16x2:   min.s16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-;	NO-I16x2-DAG: min.s16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-;	NO-I16x2-DAG: min.s16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
+; I16x2-LABEL: test_smin(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
+; I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
+; I16x2-NEXT:    min.s16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_smin(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
+; NO-I16x2-NEXT:    .reg .b32 %r<4>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    min.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    min.s16 %rs6, %rs3, %rs1;
+; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; NO-I16x2-NEXT:    ret;
   %cmp = icmp sle <2 x i16> %a, %b
   %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_umin(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_umin_param_0];
-;
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_umin_param_1];
-; I16x2:   min.u16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-;
-;	NO-I16x2-DAG: mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	NO-I16x2-DAG: mov.b32 	{[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-;	NO-I16x2-DAG: min.u16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-;	NO-I16x2-DAG: min.u16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-;	NO-I16x2-DAG: mov.b32 	[[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
+; I16x2-LABEL: test_umin(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b32 %r<4>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
+; I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
+; I16x2-NEXT:    min.u16x2 %r3, %r1, %r2;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_umin(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
+; NO-I16x2-NEXT:    .reg .b32 %r<4>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
+; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    min.u16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    min.u16 %rs6, %rs3, %rs1;
+; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
+; NO-I16x2-NEXT:    ret;
   %cmp = icmp ule <2 x i16> %a, %b
   %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_mul(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_mul_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_mul_param_1];
-;
-;	COMMON-DAG: mov.b32 	    {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-;	COMMON-DAG: mov.b32 	    {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-;	COMMON-DAG: mul.lo.s16 	[[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-;	COMMON-DAG: mul.lo.s16 	[[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-;	COMMON-DAG: mov.b32 	    [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
-;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_mul(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<7>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    mul.lo.s16 %rs5, %rs4, %rs2;
+; COMMON-NEXT:    mul.lo.s16 %rs6, %rs3, %rs1;
+; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = mul <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
 ;; Logical ops are available on all GPUs as regular 32-bit logical ops
-; COMMON-LABEL: test_or(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_param_1];
-; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], [[B]];
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_or(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<7>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r3, [test_or_param_1];
+; COMMON-NEXT:    ld.param.u32 %r4, [test_or_param_0];
+; COMMON-NEXT:    or.b32 %r5, %r4, %r3;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
+; COMMON-NEXT:    ret;
   %r = or <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
 ; Ops that operate on computed arguments go though a different lowering path.
 ; compared to the ones that operate on loaded data. So we test them separately.
-; COMMON-LABEL: test_or_computed(
-; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_or_computed_param_0];
-; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
-; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
-; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
-; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
-; COMMON:        or.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
-; COMMON-NEXT:   st.param.b32    [func_retval0], [[R]];
 define <2 x i16> @test_or_computed(i16 %a) {
+; COMMON-LABEL: test_or_computed(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<4>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u16 %rs1, [test_or_computed_param_0];
+; COMMON-NEXT:    mov.u16 %rs2, 0;
+; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; COMMON-NEXT:    mov.u16 %rs3, 5;
+; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
+; COMMON-NEXT:    or.b32 %r3, %r2, %r1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
   %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
   %r = or <2 x i16> %ins.1, %ins.0
@@ -264,46 +383,64 @@ define <2 x i16> @test_or_computed(i16 %a) {
 }
 
 ; Check that we can lower or with immediate arguments.
-; COMMON-LABEL: test_or_imm_0(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
-; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_or_imm_0(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
+; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = or <2 x i16> <i16 1, i16 2>, %a
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_or_imm_1(
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_imm_1_param_0];
-; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_or_imm_1(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
+; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = or <2 x i16> %a, <i16 1, i16 2>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_xor(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_param_1];
-; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], [[B]];
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_xor(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<7>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r3, [test_xor_param_1];
+; COMMON-NEXT:    ld.param.u32 %r4, [test_xor_param_0];
+; COMMON-NEXT:    xor.b32 %r5, %r4, %r3;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
+; COMMON-NEXT:    ret;
   %r = xor <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_xor_computed(
-; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_xor_computed_param_0];
-; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
-; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
-; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
-; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
-; COMMON:        xor.b32         [[R:%r[0-9]+]], [[R2]], [[R1]];
-; COMMON-NEXT:   st.param.b32    [func_retval0], [[R]];
 define <2 x i16> @test_xor_computed(i16 %a) {
+; COMMON-LABEL: test_xor_computed(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<4>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u16 %rs1, [test_xor_computed_param_0];
+; COMMON-NEXT:    mov.u16 %rs2, 0;
+; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; COMMON-NEXT:    mov.u16 %rs3, 5;
+; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
+; COMMON-NEXT:    xor.b32 %r3, %r2, %r1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
   %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
   %r = xor <2 x i16> %ins.1, %ins.0
@@ -311,48 +448,66 @@ define <2 x i16> @test_xor_computed(i16 %a) {
 }
 
 ; Check that we can lower xor with immediate arguments.
-; COMMON-LABEL: test_xor_imm_0(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
-; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_xor_imm_0(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
+; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = xor <2 x i16> <i16 1, i16 2>, %a
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_xor_imm_1(
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_imm_1_param_0];
-; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_xor_imm_1(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
+; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = xor <2 x i16> %a, <i16 1, i16 2>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_and(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_param_1];
-; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], [[B]];
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_and(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<7>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r3, [test_and_param_1];
+; COMMON-NEXT:    ld.param.u32 %r4, [test_and_param_0];
+; COMMON-NEXT:    and.b32 %r5, %r4, %r3;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
+; COMMON-NEXT:    ret;
   %r = and <2 x i16> %a, %b
   ret <2 x i16> %r
 }
 
 ; Ops that operate on computed arguments go though a different lowering path.
 ; compared to the ones that operate on loaded data. So we test them separately.
-; COMMON-LABEL: test_and_computed(
-; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_and_computed_param_0];
-; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
-; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
-; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
-; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
-; COMMON:        and.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
-; COMMON-NEXT:   st.param.b32    [func_retval0], [[R]];
 define <2 x i16> @test_and_computed(i16 %a) {
+; COMMON-LABEL: test_and_computed(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<4>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u16 %rs1, [test_and_computed_param_0];
+; COMMON-NEXT:    mov.u16 %rs2, 0;
+; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; COMMON-NEXT:    mov.u16 %rs3, 5;
+; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
+; COMMON-NEXT:    and.b32 %r3, %r2, %r1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
   %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
   %r = and <2 x i16> %ins.1, %ins.0
@@ -360,74 +515,102 @@ define <2 x i16> @test_and_computed(i16 %a) {
 }
 
 ; Check that we can lower and with immediate arguments.
-; COMMON-LABEL: test_and_imm_0(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_imm_0_param_0];
-; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_and_imm_0(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
+; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = and <2 x i16> <i16 1, i16 2>, %a
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_and_imm_1(
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_imm_1_param_0];
-; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_and_imm_1(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
+; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = and <2 x i16> %a, <i16 1, i16 2>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: .func test_ldst_v2i16(
-; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];
-; COMMON-DAG:    ld.param.u64    [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1];
-; COMMON-DAG:    ld.u32          [[E:%r[0-9]+]], [[[A]]];
-; COMMON-DAG:    st.u32          [[[B]]], [[E]];
-; COMMON:        ret;
 define void @test_ldst_v2i16(ptr %a, ptr %b) {
+; COMMON-LABEL: test_ldst_v2i16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<2>;
+; COMMON-NEXT:    .reg .b64 %rd<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i16_param_1];
+; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i16_param_0];
+; COMMON-NEXT:    ld.u32 %r1, [%rd1];
+; COMMON-NEXT:    st.u32 [%rd2], %r1;
+; COMMON-NEXT:    ret;
   %t1 = load <2 x i16>, ptr %a
   store <2 x i16> %t1, ptr %b, align 16
   ret void
 }
 
-; COMMON-LABEL: .func test_ldst_v3i16(
-; COMMON-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3i16_param_0];
-; COMMON-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3i16_param_1];
 ; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
 ;    number of bitshifting instructions that may change at llvm's whim.
 ;    So we only verify that we only issue correct number of writes using
 ;    correct offset, but not the values we write.
-; COMMON-DAG:    ld.u64
-; COMMON-DAG:    st.u32          [%[[B]]],
-; COMMON-DAG:    st.u16          [%[[B]]+4],
-; COMMON:        ret;
 define void @test_ldst_v3i16(ptr %a, ptr %b) {
+; COMMON-LABEL: test_ldst_v3i16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b64 %rd<5>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i16_param_1];
+; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i16_param_0];
+; COMMON-NEXT:    ld.u64 %rd3, [%rd1];
+; COMMON-NEXT:    shr.u64 %rd4, %rd3, 32;
+; COMMON-NEXT:    st.u32 [%rd2], %rd3;
+; COMMON-NEXT:    st.u16 [%rd2+4], %rd4;
+; COMMON-NEXT:    ret;
   %t1 = load <3 x i16>, ptr %a
   store <3 x i16> %t1, ptr %b, align 16
   ret void
 }
 
-; COMMON-LABEL: .func test_ldst_v4i16(
-; COMMON-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v4i16_param_0];
-; COMMON-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v4i16_param_1];
-; COMMON-DAG:    ld.v4.u16       {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]];
-; COMMON-DAG:    st.v4.u16       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; COMMON:        ret;
 define void @test_ldst_v4i16(ptr %a, ptr %b) {
+; COMMON-LABEL: test_ldst_v4i16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<5>;
+; COMMON-NEXT:    .reg .b64 %rd<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i16_param_1];
+; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i16_param_0];
+; COMMON-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; COMMON-NEXT:    st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
+; COMMON-NEXT:    ret;
   %t1 = load <4 x i16>, ptr %a
   store <4 x i16> %t1, ptr %b, align 16
   ret void
 }
 
-; COMMON-LABEL: .func test_ldst_v8i16(
-; COMMON-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v8i16_param_0];
-; COMMON-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v8i16_param_1];
-; COMMON-DAG:    ld.v4.b32       {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
-; COMMON-DAG:    st.v4.b32       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; COMMON:        ret;
 define void @test_ldst_v8i16(ptr %a, ptr %b) {
+; COMMON-LABEL: test_ldst_v8i16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-NEXT:    .reg .b64 %rd<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i16_param_1];
+; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i16_param_0];
+; COMMON-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; COMMON-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; COMMON-NEXT:    ret;
   %t1 = load <8 x i16>, ptr %a
   store <8 x i16> %t1, ptr %b, align 16
   ret void
@@ -435,139 +618,185 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) {
 
 declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0
 
-; COMMON-LABEL: test_call(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_call_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_call_param_1];
-; COMMON:      {
-; COMMON-DAG:  .param .align 4 .b8 param0[4];
-; COMMON-DAG:  .param .align 4 .b8 param1[4];
-; COMMON-DAG:  st.param.b32    [param0], [[A]];
-; COMMON-DAG:  st.param.b32    [param1], [[B]];
-; COMMON-DAG:  .param .align 4 .b8 retval0[4];
-; COMMON:      call.uni (retval0),
-; COMMON-NEXT:        test_callee,
-; COMMON:      );
-; COMMON-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; COMMON-NEXT: }
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_call(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_call_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_call_param_0];
+; COMMON-NEXT:    { // callseq 0, 0
+; COMMON-NEXT:    .param .align 4 .b8 param0[4];
+; COMMON-NEXT:    st.param.b32 [param0], %r1;
+; COMMON-NEXT:    .param .align 4 .b8 param1[4];
+; COMMON-NEXT:    st.param.b32 [param1], %r2;
+; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
+; COMMON-NEXT:    call.uni (retval0),
+; COMMON-NEXT:    test_callee,
+; COMMON-NEXT:    (
+; COMMON-NEXT:    param0,
+; COMMON-NEXT:    param1
+; COMMON-NEXT:    );
+; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
+; COMMON-NEXT:    } // callseq 0
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b)
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_call_flipped(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_call_flipped_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_call_flipped_param_1];
-; COMMON:      {
-; COMMON-DAG:  .param .align 4 .b8 param0[4];
-; COMMON-DAG:  .param .align 4 .b8 param1[4];
-; COMMON-DAG:  st.param.b32    [param0], [[B]];
-; COMMON-DAG:  st.param.b32    [param1], [[A]];
-; COMMON-DAG:  .param .align 4 .b8 retval0[4];
-; COMMON:      call.uni (retval0),
-; COMMON-NEXT:        test_callee,
-; COMMON:      );
-; COMMON-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; COMMON-NEXT: }
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_call_flipped(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
+; COMMON-NEXT:    { // callseq 1, 0
+; COMMON-NEXT:    .param .align 4 .b8 param0[4];
+; COMMON-NEXT:    st.param.b32 [param0], %r2;
+; COMMON-NEXT:    .param .align 4 .b8 param1[4];
+; COMMON-NEXT:    st.param.b32 [param1], %r1;
+; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
+; COMMON-NEXT:    call.uni (retval0),
+; COMMON-NEXT:    test_callee,
+; COMMON-NEXT:    (
+; COMMON-NEXT:    param0,
+; COMMON-NEXT:    param1
+; COMMON-NEXT:    );
+; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
+; COMMON-NEXT:    } // callseq 1
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_tailcall_flipped(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_tailcall_flipped_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_tailcall_flipped_param_1];
-; COMMON:      {
-; COMMON-DAG:  .param .align 4 .b8 param0[4];
-; COMMON-DAG:  .param .align 4 .b8 param1[4];
-; COMMON-DAG:  st.param.b32    [param0], [[B]];
-; COMMON-DAG:  st.param.b32    [param1], [[A]];
-; COMMON-DAG:  .param .align 4 .b8 retval0[4];
-; COMMON:      call.uni (retval0),
-; COMMON-NEXT:        test_callee,
-; COMMON:      );
-; COMMON-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0];
-; COMMON-NEXT: }
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
+; COMMON-LABEL: test_tailcall_flipped(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<5>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
+; COMMON-NEXT:    { // callseq 2, 0
+; COMMON-NEXT:    .param .align 4 .b8 param0[4];
+; COMMON-NEXT:    st.param.b32 [param0], %r2;
+; COMMON-NEXT:    .param .align 4 .b8 param1[4];
+; COMMON-NEXT:    st.param.b32 [param1], %r1;
+; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
+; COMMON-NEXT:    call.uni (retval0),
+; COMMON-NEXT:    test_callee,
+; COMMON-NEXT:    (
+; COMMON-NEXT:    param0,
+; COMMON-NEXT:    param1
+; COMMON-NEXT:    );
+; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
+; COMMON-NEXT:    } // callseq 2
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_select(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_select_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_select_param_1];
-; COMMON-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
-; COMMON-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; COMMON-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 {
+; COMMON-LABEL: test_select(
+; COMMON:       {
+; COMMON-NEXT:    .reg .pred %p<2>;
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; COMMON-NEXT:    and.b16 %rs2, %rs1, 1;
+; COMMON-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; COMMON-NEXT:    ld.param.u32 %r2, [test_select_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_select_param_0];
+; COMMON-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = select i1 %c, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_select_cc(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_select_cc_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_select_cc_param_1];
-; COMMON-DAG:  ld.param.u32    [[C:%r[0-9]+]], [test_select_cc_param_2];
-; COMMON-DAG:  ld.param.u32    [[D:%r[0-9]+]], [test_select_cc_param_3];
-; COMMON-DAG:  mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; COMMON-DAG:  mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; COMMON-DAG:  setp.ne.s16    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; COMMON-DAG:  setp.ne.s16    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; COMMON-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; COMMON-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; COMMON-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; COMMON-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; COMMON:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 {
+; COMMON-LABEL: test_select_cc(
+; COMMON:       {
+; COMMON-NEXT:    .reg .pred %p<3>;
+; COMMON-NEXT:    .reg .b16 %rs<11>;
+; COMMON-NEXT:    .reg .b32 %r<6>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
+; COMMON-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
+; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
+; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
+; COMMON-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; COMMON-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
+; COMMON-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
+; COMMON-NEXT:    mov.b32 %r5, {%rs10, %rs9};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
+; COMMON-NEXT:    ret;
   %cc = icmp ne <2 x i16> %c, %d
   %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_select_cc_i32_i16(
-; COMMON-DAG:  ld.param.v2.u32    {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_0];
-; COMMON-DAG:  ld.param.v2.u32    {[[B0:%r[0-9]+]], [[B1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_1];
-; COMMON-DAG:  ld.param.u32    [[C:%r[0-9]+]], [test_select_cc_i32_i16_param_2];
-; COMMON-DAG:  ld.param.u32    [[D:%r[0-9]+]], [test_select_cc_i32_i16_param_3];
-; COMMON-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; COMMON-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; COMMON-DAG: setp.ne.s16    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; COMMON-DAG: setp.ne.s16    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; COMMON-DAG: selp.b32        [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]];
-; COMMON-DAG: selp.b32        [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]];
-; COMMON-NEXT: st.param.v2.b32    [func_retval0], {[[R0]], [[R1]]};
-; COMMON-NEXT: ret;
 define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
+; COMMON-LABEL: test_select_cc_i32_i16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .pred %p<3>;
+; COMMON-NEXT:    .reg .b16 %rs<5>;
+; COMMON-NEXT:    .reg .b32 %r<9>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
+; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
+; COMMON-NEXT:    ld.param.u32 %r6, [test_select_cc_i32_i16_param_3];
+; COMMON-NEXT:    ld.param.u32 %r5, [test_select_cc_i32_i16_param_2];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
+; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
+; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
+; COMMON-NEXT:    selp.b32 %r7, %r2, %r4, %p2;
+; COMMON-NEXT:    selp.b32 %r8, %r1, %r3, %p1;
+; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; COMMON-NEXT:    ret;
                                            <2 x i16> %c, <2 x i16> %d) #0 {
   %cc = icmp ne <2 x i16> %c, %d
   %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b
   ret <2 x i32> %r
 }
 
-; COMMON-LABEL: test_select_cc_i16_i32(
-; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_select_cc_i16_i32_param_0];
-; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_select_cc_i16_i32_param_1];
-; COMMON-DAG:  ld.param.v2.u32 {[[C0:%r[0-9]+]], [[C1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_2];
-; COMMON-DAG:  ld.param.v2.u32 {[[D0:%r[0-9]+]], [[D1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_3];
-; COMMON-DAG:  setp.ne.s32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; COMMON-DAG:  setp.ne.s32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; COMMON-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; COMMON-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; COMMON-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; COMMON-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; COMMON:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; COMMON-NEXT: st.param.b32    [func_retval0], [[R]];
-; COMMON-NEXT: ret;
 define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
+; COMMON-LABEL: test_select_cc_i16_i32(
+; COMMON:       {
+; COMMON-NEXT:    .reg .pred %p<3>;
+; COMMON-NEXT:    .reg .b16 %rs<7>;
+; COMMON-NEXT:    .reg .b32 %r<8>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
+; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
+; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_i16_i32_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_i16_i32_param_0];
+; COMMON-NEXT:    setp.ne.s32 %p1, %r3, %r5;
+; COMMON-NEXT:    setp.ne.s32 %p2, %r4, %r6;
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; COMMON-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
+; COMMON-NEXT:    mov.b32 %r7, {%rs6, %rs5};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r7;
+; COMMON-NEXT:    ret;
                                           <2 x i32> %c, <2 x i32> %d) #0 {
   %cc = icmp ne <2 x i32> %c, %d
   %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
@@ -575,79 +804,114 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
 }
 
 
-; COMMON-LABEL: test_trunc_2xi32(
-; COMMON:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_trunc_2xi32_param_0];
-; COMMON-DAG:  cvt.u16.u32  [[R0:%rs[0-9]+]], [[A0]];
-; COMMON-DAG:  cvt.u16.u32  [[R1:%rs[0-9]+]], [[A1]];
-; COMMON:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; COMMON:      st.param.b32    [func_retval0], [[R]];
-; COMMON:      ret;
 define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
+; COMMON-LABEL: test_trunc_2xi32(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; COMMON-NEXT:    cvt.u16.u32 %rs1, %r2;
+; COMMON-NEXT:    cvt.u16.u32 %rs2, %r1;
+; COMMON-NEXT:    mov.b32 %r3, {%rs2, %rs1};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
+; COMMON-NEXT:    ret;
   %r = trunc <2 x i32> %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_trunc_2xi64(
-; COMMON:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_trunc_2xi64_param_0];
-; COMMON-DAG:  cvt.u16.u64  [[R0:%rs[0-9]+]], [[A0]];
-; COMMON-DAG:  cvt.u16.u64  [[R1:%rs[0-9]+]], [[A1]];
-; COMMON:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; COMMON:      st.param.b32    [func_retval0], [[R]];
-; COMMON:      ret;
 define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 {
+; COMMON-LABEL: test_trunc_2xi64(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<2>;
+; COMMON-NEXT:    .reg .b64 %rd<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
+; COMMON-NEXT:    cvt.u16.u64 %rs1, %rd2;
+; COMMON-NEXT:    cvt.u16.u64 %rs2, %rd1;
+; COMMON-NEXT:    mov.b32 %r1, {%rs2, %rs1};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
+; COMMON-NEXT:    ret;
   %r = trunc <2 x i64> %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_zext_2xi32(
-; COMMON:      ld.param.u32    [[A:%r[0-9]+]], [test_zext_2xi32_param_0];
-; COMMON:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; COMMON-DAG:  cvt.u32.u16     [[R0:%r[0-9]+]], [[A0]];
-; COMMON-DAG:  cvt.u32.u16     [[R1:%r[0-9]+]], [[A1]];
-; COMMON-NEXT: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]};
-; COMMON:      ret;
 define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_zext_2xi32(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<4>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
+; COMMON-NEXT:    cvt.u32.u16 %r3, %rs2;
+; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r3};
+; COMMON-NEXT:    ret;
   %r = zext <2 x i16> %a to <2 x i32>
   ret <2 x i32> %r
 }
 
-; COMMON-LABEL: test_zext_2xi64(
-; COMMON:      ld.param.u32    [[A:%r[0-9]+]], [test_zext_2xi64_param_0];
-; COMMON:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; COMMON-DAG:  cvt.u64.u16     [[R0:%rd[0-9]+]], [[A0]];
-; COMMON-DAG:  cvt.u64.u16     [[R1:%rd[0-9]+]], [[A1]];
-; COMMON-NEXT: st.param.v2.b64 [func_retval0], {[[R0]], [[R1]]};
-; COMMON:      ret;
 define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_zext_2xi64(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<2>;
+; COMMON-NEXT:    .reg .b64 %rd<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    cvt.u64.u16 %rd1, %rs2;
+; COMMON-NEXT:    cvt.u64.u16 %rd2, %rs1;
+; COMMON-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; COMMON-NEXT:    ret;
   %r = zext <2 x i16> %a to <2 x i64>
   ret <2 x i64> %r
 }
 
-; COMMON-LABEL: test_bitcast_i32_to_2xi16(
-; COMMON: ld.param.u32 	[[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0];
-; COMMON: st.param.b32 	[func_retval0], [[R]];
-; COMMON: ret;
 define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
+; COMMON-LABEL: test_bitcast_i32_to_2xi16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0];
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
+; COMMON-NEXT:    ret;
   %r = bitcast i32 %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_bitcast_2xi16_to_i32(
-; COMMON: ld.param.u32 	[[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0];
-; COMMON: st.param.b32 	[func_retval0], [[R]];
-; COMMON: ret;
 define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_bitcast_2xi16_to_i32(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r2, [test_bitcast_2xi16_to_i32_param_0];
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = bitcast <2 x i16> %a to i32
   ret i32 %r
 }
 
-; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf(
-; COMMON: ld.param.u16 	[[RS1:%rs[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0];
-; COMMON:	mov.u16 	[[RS2:%rs[0-9]+]], 5;
-; COMMON:	mov.b32 	[[R:%r[0-9]+]], {[[RS1]], [[RS2]]};
-; COMMON: st.param.b32 	[func_retval0], [[R]];
-; COMMON: ret;
 define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
+; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
+; COMMON-NEXT:    mov.u16 %rs2, 5;
+; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
+; COMMON-NEXT:    ret;
   %ins.0 = insertelement <2 x i16> undef, i16 %a, i32 0
   %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
   %r = bitcast <2 x i16> %ins.1 to <2 x half>
@@ -655,43 +919,71 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
 }
 
 
-; COMMON-LABEL: test_shufflevector(
-; COMMON:	ld.param.u32 	[[R:%r[0-9]+]], [test_shufflevector_param_0];
-; COMMON:	mov.b32 	{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]];
-; COMMON:	mov.b32 	[[R1:%r[0-9]+]], {[[RS1]], [[RS0]]};
-; COMMON:	st.param.b32 	[func_retval0], [[R1]];
-; COMMON:	ret;
 define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
+; COMMON-LABEL: test_shufflevector(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %s
 }
 
-; COMMON-LABEL: test_insertelement(
-; COMMON:  ld.param.u16 	[[B:%rs[0-9]+]], [test_insertelement_param_1];
-; COMMON:	ld.param.u32 	[[A:%r[0-9]+]], [test_insertelement_param_0];
-; COMMON:	{ .reg .b16 tmp; mov.b32 {[[R0:%rs[0-9]+]], tmp}, [[A]]; }
-; COMMON:	mov.b32 	[[R1:%r[0-9]+]], {[[R0]], [[B]]};
-; COMMON:	st.param.b32 	[func_retval0], [[R1]];
-; COMMON:	ret;
 define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
+; COMMON-LABEL: test_insertelement(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<3>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.u16 %rs1, [test_insertelement_param_1];
+; COMMON-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
+; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
+; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %i = insertelement <2 x i16> %a, i16 %x, i64 1
   ret <2 x i16> %i
 }
 
-; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16(
-; COMMON:      cvt.rzi.s16.f16
-; COMMON:      cvt.rzi.s16.f16
-; COMMON:      ret;
 define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 {
+; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<5>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
+; COMMON-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
+; COMMON-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = fptosi <2 x half> %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; COMMON-LABEL: test_fptoui_2xhalf_to_2xi16(
-; COMMON:      cvt.rzi.u16.f16
-; COMMON:      cvt.rzi.u16.f16
-; COMMON:      ret;
 define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 {
+; COMMON-LABEL: test_fptoui_2xhalf_to_2xi16(
+; COMMON:       {
+; COMMON-NEXT:    .reg .b16 %rs<5>;
+; COMMON-NEXT:    .reg .b32 %r<3>;
+; COMMON-EMPTY:
+; COMMON-NEXT:  // %bb.0:
+; COMMON-NEXT:    ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; COMMON-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
+; COMMON-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
+; COMMON-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
+; COMMON-NEXT:    ret;
   %r = fptoui <2 x half> %a to <2 x i16>
   ret <2 x i16> %r
 }
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
index df9c3e59b0e6b..e9662dd8a7fa3 100644
--- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80        \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck  %s
 ; RUN: %if ptxas %{                                                           \
@@ -9,25 +10,37 @@
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
-; CHECK-LABEL: test_bitcast_2xi8_i16(
-; CHECK: ld.param.u32 	%r1, [test_bitcast_2xi8_i16_param_0];
-; CHECK: mov.b32 	{%rs1, %rs2}, %r1;
-; CHECK: shl.b16 	%rs3, %rs2, 8;
-; CHECK: and.b16  	%rs4, %rs1, 255;
-; CHECK: or.b16  	%rs5, %rs4, %rs3;
-; CHECK: cvt.u32.u16 	%r2, %rs5;
-; CHECK: st.param.b32 	[func_retval0], %r2;
 define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
+; CHECK-LABEL: test_bitcast_2xi8_i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<6>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    shl.b16 %rs3, %rs2, 8;
+; CHECK-NEXT:    and.b16 %rs4, %rs1, 255;
+; CHECK-NEXT:    or.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r2, %rs5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %res = bitcast <2 x i8> %a to i16
   ret i16 %res
 }
 
-; CHECK-LABEL: test_bitcast_i16_2xi8(
-; CHECK: ld.param.u16 	%rs1, [test_bitcast_i16_2xi8_param_0];
-; CHECK: shr.u16 	%rs2, %rs1, 8;
-; CHECK: mov.b32 	%r1, {%rs1, %rs2};
-; CHECK: st.param.b32 	[func_retval0], %r1;
 define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
+; CHECK-LABEL: test_bitcast_i16_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
+; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %res = bitcast i16 %a to <2 x i8>
   ret <2 x i8> %res
 }
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
new file mode 100644
index 0000000000000..e036753ce9030
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: wombat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %bb
+; CHECK-NEXT:    ld.param.u32 %r4, [wombat_param_2];
+; CHECK-NEXT:    ld.param.u32 %r3, [wombat_param_1];
+; CHECK-NEXT:    ld.param.u32 %r2, [wombat_param_0];
+; CHECK-NEXT:    mov.b32 %r10, 0;
+; CHECK-NEXT:    mov.u64 %rd1, 0;
+; CHECK-NEXT:    mov.b32 %r6, 1;
+; CHECK-NEXT:  $L__BB0_1: // %bb3
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .b64 param0;
+; CHECK-NEXT:    st.param.f64 [param0], 0d0000000000000000;
+; CHECK-NEXT:    .param .b64 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    quux,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
+; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
+; CHECK-NEXT:    mul.lo.s32 %r9, %r2, %r8;
+; CHECK-NEXT:    cvt.rn.f64.s32 %fd3, %r9;
+; CHECK-NEXT:    ld.param.f64 %fd1, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    cvt.rn.f64.u32 %fd4, %r10;
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, %fd3;
+; CHECK-NEXT:    st.global.f64 [%rd1], %fd5;
+; CHECK-NEXT:    mov.u32 %r10, %r6;
+; CHECK-NEXT:    bra.uni $L__BB0_1;
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb3, %bb
+  %phi = phi i32 [ 0, %bb ], [ 1, %bb3 ]
+  %call = tail call double @quux(double 0.000000e+00)
+  %mul = mul i32 %phi, %arg1
+  %or = or i32 %arg2, %mul
+  %mul4 = mul i32 %arg, %or
+  %sitofp = sitofp i32 %mul4 to double
+  %uitofp = uitofp i32 %phi to double
+  %fadd = fadd double %uitofp, %sitofp
+  store double %fadd, ptr addrspace(1) null, align 8
+  br label %bb3
+}
+
+declare double @quux(double)
diff --git a/llvm/test/CodeGen/PowerPC/global-merge-aix-sections.ll b/llvm/test/CodeGen/PowerPC/global-merge-aix-sections.ll
new file mode 100644
index 0000000000000..0fb313b572677
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/global-merge-aix-sections.ll
@@ -0,0 +1,52 @@
+; RUN: rm -rf %t
+; RUN: mkdir -p %t
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 --filetype=obj -o %t/global-merge-aix-sections.o < %s
+; RUN: llvm-objdump --syms %t/global-merge-aix-sections.o | FileCheck %s --check-prefix=DATA
+
+%struct.Example = type { i32, i8 }
+
+@y = internal global i32 0, section "mycsect", align 4
+@z = internal global i32 0, section "mycsect", align 4
+@l = internal global i32 0, align 4
+@u = internal global i16 0, section "mycsect", align 2
+@myStruct1 = internal global %struct.Example zeroinitializer, section "mycsect", align 4
+
+; Function Attrs: nounwind
+define void @g() {
+entry:
+  tail call void @f(ptr noundef nonnull @y, ptr noundef nonnull @z)
+  tail call void @f(ptr noundef nonnull @l, ptr noundef nonnull @z)
+  tail call void @h(ptr noundef nonnull @u)
+  tail call void @s(ptr noundef nonnull @myStruct1)
+  ret void
+}
+
+declare void @f(ptr noundef, ptr noundef)
+declare void @h(ptr noundef)
+declare void @s(ptr noundef)
+
+; CHECK: .csect mycsect[RW],2
+; CHECK-NEXT: .lglobl u # @_MergedGlobals
+; CHECK-NEXT: .lglobl y
+; CHECK-NEXT: .lglobl z
+; CHECK-NEXT: .lglobl myStruct1
+; CHECK-NEXT: .align  2
+; CHECK-NEXT: L.._MergedGlobals:
+; CHECK-NEXT: u:
+; CHECK-NEXT:        .space  2
+; CHECK-NEXT:        .space  2
+; CHECK-NEXT: y:
+; CHECK-NEXT:        .space  4
+; CHECK-NEXT: z:
+; CHECK-NEXT:        .space  4
+; CHECK-NEXT: myStruct1:
+; CHECK-NEXT:        .space  8
+
+; DATA: 00000078 l     O .data  00000014 mycsect
+; DATA-NEXT: 00000078 l     O .data (csect: mycsect)         00000000 u
+; DATA-NEXT: 0000007c l     O .data (csect: mycsect)         00000000 y
+; DATA-NEXT: 00000080 l     O .data (csect: mycsect)         00000000 z
+; DATA-NEXT: 00000084 l     O .data (csect: mycsect)         00000000 myStruct1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir
deleted file mode 100644
index d0237892d132f..0000000000000
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir
+++ /dev/null
@@ -1,160 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=riscv64 -run-pass=instruction-select %s -o - \
-# RUN:   -code-model=medium | FileCheck %s
-
---- |
-  define i32 @jt_test(i32 signext %in) {
-  entry:
-    %0 = sext i32 %in to i64
-    switch i64 %0, label %default [
-      i64 1, label %bb1
-      i64 2, label %bb2
-      i64 3, label %bb3
-      i64 4, label %bb4
-      i64 5, label %bb5
-      i64 6, label %bb6
-    ]
-
-  bb1:                                              ; preds = %entry
-    ret i32 4
-
-  bb2:                                              ; preds = %entry
-    ret i32 3
-
-  bb3:                                              ; preds = %entry
-    ret i32 2
-
-  bb4:                                              ; preds = %entry
-    ret i32 1
-
-  bb5:                                              ; preds = %entry
-    ret i32 100
-
-  bb6:                                              ; preds = %entry
-    ret i32 200
-
-  default:                                          ; preds = %entry
-    ret i32 1000
-  }
-
-...
----
-name:            jt_test
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-jumpTable:
-  kind:            block-address
-  entries:
-    - id:              0
-      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
-body:             |
-  ; CHECK-LABEL: name: jt_test
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; CHECK-NEXT:   [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 0
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI [[ADDIW]], -1
-  ; CHECK-NEXT:   BLTU [[ADDI]], [[ADDI1]], %bb.8
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1.entry:
-  ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PseudoLLA:%[0-9]+]]:gpr = PseudoLLA %jump-table.0
-  ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI1]], 3
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[PseudoLLA]], [[SLLI]]
-  ; CHECK-NEXT:   [[LD:%[0-9]+]]:gprjalr = LD [[ADD]], 0 :: (load (s64) from jump-table)
-  ; CHECK-NEXT:   PseudoBRIND [[LD]], 0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2.bb1:
-  ; CHECK-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI2]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI3]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.4.bb3:
-  ; CHECK-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI4]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.5.bb4:
-  ; CHECK-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI5]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.6.bb5:
-  ; CHECK-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI6]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.7.bb6:
-  ; CHECK-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI7]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.8.default:
-  ; CHECK-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI8]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  bb.1.entry:
-    successors: %bb.8, %bb.9
-    liveins: $x10
-
-    %1:gprb(s64) = COPY $x10
-    %2:gprb(s64) = G_ASSERT_SEXT %1, 32
-    %7:gprb(s64) = G_CONSTANT i64 5
-    %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 -1
-    %5:gprb(s64) = G_ADD %3, %4
-    %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
-    G_BRCOND %26(s64), %bb.8
-
-  bb.9.entry:
-    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
-
-    %10:gprb(p0) = G_JUMP_TABLE %jump-table.0
-    G_BRJT %10(p0), %jump-table.0, %5(s64)
-
-  bb.2.bb1:
-    %22:gprb(s64) = G_CONSTANT i64 4
-    $x10 = COPY %22(s64)
-    PseudoRET implicit $x10
-
-  bb.3.bb2:
-    %20:gprb(s64) = G_CONSTANT i64 3
-    $x10 = COPY %20(s64)
-    PseudoRET implicit $x10
-
-  bb.4.bb3:
-    %18:gprb(s64) = G_CONSTANT i64 2
-    $x10 = COPY %18(s64)
-    PseudoRET implicit $x10
-
-  bb.5.bb4:
-    %16:gprb(s64) = G_CONSTANT i64 1
-    $x10 = COPY %16(s64)
-    PseudoRET implicit $x10
-
-  bb.6.bb5:
-    %14:gprb(s64) = G_CONSTANT i64 100
-    $x10 = COPY %14(s64)
-    PseudoRET implicit $x10
-
-  bb.7.bb6:
-    %12:gprb(s64) = G_CONSTANT i64 200
-    $x10 = COPY %12(s64)
-    PseudoRET implicit $x10
-
-  bb.8.default:
-    %24:gprb(s64) = G_CONSTANT i64 1000
-    $x10 = COPY %24(s64)
-    PseudoRET implicit $x10
-
-...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir
deleted file mode 100644
index 0a08586bc1af4..0000000000000
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir
+++ /dev/null
@@ -1,161 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=riscv64 -run-pass=instruction-select %s -o - \
-# RUN:   -relocation-model=pic | FileCheck %s
-
---- |
-  define i32 @jt_test(i32 signext %in) {
-  entry:
-    %0 = sext i32 %in to i64
-    switch i64 %0, label %default [
-      i64 1, label %bb1
-      i64 2, label %bb2
-      i64 3, label %bb3
-      i64 4, label %bb4
-      i64 5, label %bb5
-      i64 6, label %bb6
-    ]
-
-  bb1:                                              ; preds = %entry
-    ret i32 4
-
-  bb2:                                              ; preds = %entry
-    ret i32 3
-
-  bb3:                                              ; preds = %entry
-    ret i32 2
-
-  bb4:                                              ; preds = %entry
-    ret i32 1
-
-  bb5:                                              ; preds = %entry
-    ret i32 100
-
-  bb6:                                              ; preds = %entry
-    ret i32 200
-
-  default:                                          ; preds = %entry
-    ret i32 1000
-  }
-
-...
----
-name:            jt_test
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-jumpTable:
-  kind:            label-difference32
-  entries:
-    - id:              0
-      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
-body:             |
-  ; CHECK-LABEL: name: jt_test
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; CHECK-NEXT:   [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 0
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI [[ADDIW]], -1
-  ; CHECK-NEXT:   BLTU [[ADDI]], [[ADDI1]], %bb.8
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1.entry:
-  ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PseudoLLA:%[0-9]+]]:gpr = PseudoLLA %jump-table.0
-  ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI1]], 2
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[PseudoLLA]], [[SLLI]]
-  ; CHECK-NEXT:   [[LW:%[0-9]+]]:gpr = LW [[ADD]], 0 :: (load (s32) from jump-table)
-  ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:gprjalr = ADD [[LW]], [[PseudoLLA]]
-  ; CHECK-NEXT:   PseudoBRIND [[ADD1]], 0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2.bb1:
-  ; CHECK-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI2]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI3]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.4.bb3:
-  ; CHECK-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI4]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.5.bb4:
-  ; CHECK-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI5]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.6.bb5:
-  ; CHECK-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI6]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.7.bb6:
-  ; CHECK-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI7]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.8.default:
-  ; CHECK-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI8]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  bb.1.entry:
-    successors: %bb.8, %bb.9
-    liveins: $x10
-
-    %1:gprb(s64) = COPY $x10
-    %2:gprb(s64) = G_ASSERT_SEXT %1, 32
-    %7:gprb(s64) = G_CONSTANT i64 5
-    %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 -1
-    %5:gprb(s64) = G_ADD %3, %4
-    %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
-    G_BRCOND %26(s64), %bb.8
-
-  bb.9.entry:
-    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
-
-    %10:gprb(p0) = G_JUMP_TABLE %jump-table.0
-    G_BRJT %10(p0), %jump-table.0, %5(s64)
-
-  bb.2.bb1:
-    %22:gprb(s64) = G_CONSTANT i64 4
-    $x10 = COPY %22(s64)
-    PseudoRET implicit $x10
-
-  bb.3.bb2:
-    %20:gprb(s64) = G_CONSTANT i64 3
-    $x10 = COPY %20(s64)
-    PseudoRET implicit $x10
-
-  bb.4.bb3:
-    %18:gprb(s64) = G_CONSTANT i64 2
-    $x10 = COPY %18(s64)
-    PseudoRET implicit $x10
-
-  bb.5.bb4:
-    %16:gprb(s64) = G_CONSTANT i64 1
-    $x10 = COPY %16(s64)
-    PseudoRET implicit $x10
-
-  bb.6.bb5:
-    %14:gprb(s64) = G_CONSTANT i64 100
-    $x10 = COPY %14(s64)
-    PseudoRET implicit $x10
-
-  bb.7.bb6:
-    %12:gprb(s64) = G_CONSTANT i64 200
-    $x10 = COPY %12(s64)
-    PseudoRET implicit $x10
-
-  bb.8.default:
-    %24:gprb(s64) = G_CONSTANT i64 1000
-    $x10 = COPY %24(s64)
-    PseudoRET implicit $x10
-
-...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir
deleted file mode 100644
index efa1a6c86027d..0000000000000
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir
+++ /dev/null
@@ -1,213 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=riscv32 -mattr=+m -run-pass=instruction-select %s -o - \
-# RUN:   -code-model=small | FileCheck %s --check-prefix=RV32-SMALL
-# RUN: llc -mtriple=riscv32 -mattr=+m -run-pass=instruction-select %s -o - \
-# RUN:   -code-model=medium | FileCheck %s --check-prefix=RV32-MEDIUM
-
---- |
-  define i32 @jt_test(i32 signext %in) {
-  entry:
-    switch i32 %in, label %default [
-      i32 1, label %bb1
-      i32 2, label %bb2
-      i32 3, label %bb3
-      i32 4, label %bb4
-      i32 5, label %bb5
-      i32 6, label %bb6
-    ]
-
-  bb1:                                              ; preds = %entry
-    ret i32 4
-
-  bb2:                                              ; preds = %entry
-    ret i32 3
-
-  bb3:                                              ; preds = %entry
-    ret i32 2
-
-  bb4:                                              ; preds = %entry
-    ret i32 1
-
-  bb5:                                              ; preds = %entry
-    ret i32 100
-
-  bb6:                                              ; preds = %entry
-    ret i32 200
-
-  default:                                          ; preds = %entry
-    ret i32 1000
-  }
-
-...
----
-name:            jt_test
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-jumpTable:
-  kind:            block-address
-  entries:
-    - id:              0
-      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
-body:             |
-  ; RV32-SMALL-LABEL: name: jt_test
-  ; RV32-SMALL: bb.0.entry:
-  ; RV32-SMALL-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
-  ; RV32-SMALL-NEXT:   liveins: $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; RV32-SMALL-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; RV32-SMALL-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; RV32-SMALL-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; RV32-SMALL-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; RV32-SMALL-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; RV32-SMALL-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; RV32-SMALL-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; RV32-SMALL-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; RV32-SMALL-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI [[COPY]], -1
-  ; RV32-SMALL-NEXT:   BLTU [[ADDI]], [[ADDI8]], %bb.8
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.1.entry:
-  ; RV32-SMALL-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT:   [[LUI:%[0-9]+]]:gpr = LUI target-flags(riscv-hi) %jump-table.0
-  ; RV32-SMALL-NEXT:   [[ADDI9:%[0-9]+]]:gpr = ADDI [[LUI]], target-flags(riscv-lo) %jump-table.0
-  ; RV32-SMALL-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI8]], 2
-  ; RV32-SMALL-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[ADDI9]], [[SLLI]]
-  ; RV32-SMALL-NEXT:   [[LW:%[0-9]+]]:gprjalr = LW [[ADD]], 0 :: (load (s32) from jump-table)
-  ; RV32-SMALL-NEXT:   PseudoBRIND [[LW]], 0
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.2.bb1:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI6]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.3.bb2:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI5]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.4.bb3:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI4]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.5.bb4:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI3]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.6.bb5:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI2]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.7.bb6:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI1]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ; RV32-SMALL-NEXT: {{  $}}
-  ; RV32-SMALL-NEXT: bb.8.default:
-  ; RV32-SMALL-NEXT:   $x10 = COPY [[ADDI7]]
-  ; RV32-SMALL-NEXT:   PseudoRET implicit $x10
-  ;
-  ; RV32-MEDIUM-LABEL: name: jt_test
-  ; RV32-MEDIUM: bb.0.entry:
-  ; RV32-MEDIUM-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
-  ; RV32-MEDIUM-NEXT:   liveins: $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; RV32-MEDIUM-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; RV32-MEDIUM-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; RV32-MEDIUM-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; RV32-MEDIUM-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; RV32-MEDIUM-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; RV32-MEDIUM-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; RV32-MEDIUM-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; RV32-MEDIUM-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; RV32-MEDIUM-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI [[COPY]], -1
-  ; RV32-MEDIUM-NEXT:   BLTU [[ADDI]], [[ADDI8]], %bb.8
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.1.entry:
-  ; RV32-MEDIUM-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT:   [[PseudoLLA:%[0-9]+]]:gpr = PseudoLLA %jump-table.0
-  ; RV32-MEDIUM-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI8]], 2
-  ; RV32-MEDIUM-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[PseudoLLA]], [[SLLI]]
-  ; RV32-MEDIUM-NEXT:   [[LW:%[0-9]+]]:gprjalr = LW [[ADD]], 0 :: (load (s32) from jump-table)
-  ; RV32-MEDIUM-NEXT:   PseudoBRIND [[LW]], 0
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.2.bb1:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI6]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.3.bb2:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI5]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.4.bb3:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI4]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.5.bb4:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI3]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.6.bb5:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI2]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.7.bb6:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI1]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  ; RV32-MEDIUM-NEXT: {{  $}}
-  ; RV32-MEDIUM-NEXT: bb.8.default:
-  ; RV32-MEDIUM-NEXT:   $x10 = COPY [[ADDI7]]
-  ; RV32-MEDIUM-NEXT:   PseudoRET implicit $x10
-  bb.1.entry:
-    successors: %bb.8, %bb.9
-    liveins: $x10
-
-    %0:gprb(s32) = COPY $x10
-    %4:gprb(s32) = G_CONSTANT i32 5
-    %8:gprb(s32) = G_CONSTANT i32 200
-    %9:gprb(s32) = G_CONSTANT i32 100
-    %10:gprb(s32) = G_CONSTANT i32 1
-    %11:gprb(s32) = G_CONSTANT i32 2
-    %12:gprb(s32) = G_CONSTANT i32 3
-    %13:gprb(s32) = G_CONSTANT i32 4
-    %14:gprb(s32) = G_CONSTANT i32 1000
-    %1:gprb(s32) = G_CONSTANT i32 -1
-    %2:gprb(s32) = G_ADD %0, %1
-    %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4
-    G_BRCOND %16(s32), %bb.8
-
-  bb.9.entry:
-    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
-
-    %7:gprb(p0) = G_JUMP_TABLE %jump-table.0
-    G_BRJT %7(p0), %jump-table.0, %2(s32)
-
-  bb.2.bb1:
-    $x10 = COPY %13(s32)
-    PseudoRET implicit $x10
-
-  bb.3.bb2:
-    $x10 = COPY %12(s32)
-    PseudoRET implicit $x10
-
-  bb.4.bb3:
-    $x10 = COPY %11(s32)
-    PseudoRET implicit $x10
-
-  bb.5.bb4:
-    $x10 = COPY %10(s32)
-    PseudoRET implicit $x10
-
-  bb.6.bb5:
-    $x10 = COPY %9(s32)
-    PseudoRET implicit $x10
-
-  bb.7.bb6:
-    $x10 = COPY %8(s32)
-    PseudoRET implicit $x10
-
-  bb.8.default:
-    $x10 = COPY %14(s32)
-    PseudoRET implicit $x10
-
-...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir
deleted file mode 100644
index 12b1517e2cfb5..0000000000000
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir
+++ /dev/null
@@ -1,161 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=riscv64 -run-pass=instruction-select %s -o - \
-# RUN:   -code-model=small | FileCheck %s
-
---- |
-  define i32 @jt_test(i32 signext %in) {
-  entry:
-    %0 = sext i32 %in to i64
-    switch i64 %0, label %default [
-      i64 1, label %bb1
-      i64 2, label %bb2
-      i64 3, label %bb3
-      i64 4, label %bb4
-      i64 5, label %bb5
-      i64 6, label %bb6
-    ]
-
-  bb1:                                              ; preds = %entry
-    ret i32 4
-
-  bb2:                                              ; preds = %entry
-    ret i32 3
-
-  bb3:                                              ; preds = %entry
-    ret i32 2
-
-  bb4:                                              ; preds = %entry
-    ret i32 1
-
-  bb5:                                              ; preds = %entry
-    ret i32 100
-
-  bb6:                                              ; preds = %entry
-    ret i32 200
-
-  default:                                          ; preds = %entry
-    ret i32 1000
-  }
-
-...
----
-name:            jt_test
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-jumpTable:
-  kind:            custom32
-  entries:
-    - id:              0
-      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
-body:             |
-  ; CHECK-LABEL: name: jt_test
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; CHECK-NEXT:   [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 0
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI [[ADDIW]], -1
-  ; CHECK-NEXT:   BLTU [[ADDI]], [[ADDI1]], %bb.8
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1.entry:
-  ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[LUI:%[0-9]+]]:gpr = LUI target-flags(riscv-hi) %jump-table.0
-  ; CHECK-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI [[LUI]], target-flags(riscv-lo) %jump-table.0
-  ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI1]], 2
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[ADDI2]], [[SLLI]]
-  ; CHECK-NEXT:   [[LW:%[0-9]+]]:gprjalr = LW [[ADD]], 0 :: (load (s32) from jump-table)
-  ; CHECK-NEXT:   PseudoBRIND [[LW]], 0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2.bb1:
-  ; CHECK-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI3]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI4]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.4.bb3:
-  ; CHECK-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI5]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.5.bb4:
-  ; CHECK-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI6]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.6.bb5:
-  ; CHECK-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI7]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.7.bb6:
-  ; CHECK-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI8]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.8.default:
-  ; CHECK-NEXT:   [[ADDI9:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI9]]
-  ; CHECK-NEXT:   PseudoRET implicit $x10
-  bb.1.entry:
-    successors: %bb.8, %bb.9
-    liveins: $x10
-
-    %1:gprb(s64) = COPY $x10
-    %2:gprb(s64) = G_ASSERT_SEXT %1, 32
-    %7:gprb(s64) = G_CONSTANT i64 5
-    %3:gprb(s64) = G_SEXT_INREG %2, 32
-    %4:gprb(s64) = G_CONSTANT i64 -1
-    %5:gprb(s64) = G_ADD %3, %4
-    %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7
-    G_BRCOND %26(s64), %bb.8
-
-  bb.9.entry:
-    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
-
-    %10:gprb(p0) = G_JUMP_TABLE %jump-table.0
-    G_BRJT %10(p0), %jump-table.0, %5(s64)
-
-  bb.2.bb1:
-    %22:gprb(s64) = G_CONSTANT i64 4
-    $x10 = COPY %22(s64)
-    PseudoRET implicit $x10
-
-  bb.3.bb2:
-    %20:gprb(s64) = G_CONSTANT i64 3
-    $x10 = COPY %20(s64)
-    PseudoRET implicit $x10
-
-  bb.4.bb3:
-    %18:gprb(s64) = G_CONSTANT i64 2
-    $x10 = COPY %18(s64)
-    PseudoRET implicit $x10
-
-  bb.5.bb4:
-    %16:gprb(s64) = G_CONSTANT i64 1
-    $x10 = COPY %16(s64)
-    PseudoRET implicit $x10
-
-  bb.6.bb5:
-    %14:gprb(s64) = G_CONSTANT i64 100
-    $x10 = COPY %14(s64)
-    PseudoRET implicit $x10
-
-  bb.7.bb6:
-    %12:gprb(s64) = G_CONSTANT i64 200
-    $x10 = COPY %12(s64)
-    PseudoRET implicit $x10
-
-  bb.8.default:
-    %24:gprb(s64) = G_CONSTANT i64 1000
-    $x10 = COPY %24(s64)
-    PseudoRET implicit $x10
-
-...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv32.mir
index 3964fd1a918aa..be4416e07ccf7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv32.mir
@@ -45,6 +45,29 @@ body:            |
     $x10 = COPY %1(s32)
     PseudoRET implicit $x10
 
+...
+---
+name:            load_i8_i16
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:            |
+  bb.0:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: load_i8_i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[LBU:%[0-9]+]]:gpr = LBU [[COPY]], 0 :: (load (s8))
+    ; CHECK-NEXT: $x10 = COPY [[LBU]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(p0) = COPY $x10
+    %1:gprb(s16) = G_LOAD %0(p0) :: (load (s8))
+    %2:gprb(s32) = G_ANYEXT %1
+    $x10 = COPY %2(s32)
+    PseudoRET implicit $x10
+
 ...
 ---
 name:            load_i16_i16
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv64.mir
index 70dd2bfee28ba..b78f78ff5f8b2 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/load-rv64.mir
@@ -45,6 +45,29 @@ body:            |
     $x10 = COPY %1(s64)
     PseudoRET implicit $x10
 
+...
+---
+name:            load_i8_i16
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:            |
+  bb.0:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: load_i8_i16
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[LBU:%[0-9]+]]:gpr = LBU [[COPY]], 0 :: (load (s8))
+    ; CHECK-NEXT: $x10 = COPY [[LBU]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(p0) = COPY $x10
+    %1:gprb(s16) = G_LOAD %0(p0) :: (load (s8))
+    %2:gprb(s64) = G_ANYEXT %1
+    $x10 = COPY %2(s64)
+    PseudoRET implicit $x10
+
 ...
 ---
 name:            load_i16_i16
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv32.mir
index f1cc69517cf8f..227eaae555c80 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv32.mir
@@ -45,6 +45,29 @@ body:            |
     G_STORE %0(s32), %1(p0) :: (store (s16))
     PseudoRET
 
+...
+---
+name:            store_i8_i16
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:            |
+  bb.0:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: store_i8_i16
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: SB [[COPY]], [[COPY1]], 0 :: (store (s8))
+    ; CHECK-NEXT: PseudoRET
+    %0:gprb(s32) = COPY $x10
+    %1:gprb(p0) = COPY $x11
+    %2:gprb(s16) = G_TRUNC %0
+    G_STORE %2(s16), %1(p0) :: (store (s8))
+    PseudoRET
+
 ...
 ---
 name:            store_i16_i16
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv64.mir
index 69f590c1df597..c177637cfd599 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/store-rv64.mir
@@ -45,6 +45,29 @@ body:            |
     G_STORE %0(s64), %1(p0) :: (store (s16))
     PseudoRET
 
+...
+---
+name:            store_i8_i16
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:            |
+  bb.0:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: store_i8_i16
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: SB [[COPY]], [[COPY1]], 0 :: (store (s8))
+    ; CHECK-NEXT: PseudoRET
+    %0:gprb(s64) = COPY $x10
+    %1:gprb(p0) = COPY $x11
+    %2:gprb(s16) = G_TRUNC %0
+    G_STORE %2(s16), %1(p0) :: (store (s8))
+    PseudoRET
+
 ...
 ---
 name:            store_i16_i16
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/jumptable.ll b/llvm/test/CodeGen/RISCV/GlobalISel/jumptable.ll
index 9dda1a241e042..c56a8ca5755ce 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/jumptable.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/jumptable.ll
@@ -94,7 +94,7 @@ define void @above_threshold(i32 signext %in, ptr %out) nounwind {
 ; RV32I-PIC-NEXT:    slli a0, a0, 2
 ; RV32I-PIC-NEXT:    add a0, a2, a0
 ; RV32I-PIC-NEXT:    lw a0, 0(a0)
-; RV32I-PIC-NEXT:    add a0, a0, a2
+; RV32I-PIC-NEXT:    add a0, a2, a0
 ; RV32I-PIC-NEXT:    jr a0
 ; RV32I-PIC-NEXT:  .LBB0_2: # %bb1
 ; RV32I-PIC-NEXT:    li a0, 4
@@ -199,7 +199,7 @@ define void @above_threshold(i32 signext %in, ptr %out) nounwind {
 ; RV64I-PIC-NEXT:    slli a0, a0, 2
 ; RV64I-PIC-NEXT:    add a0, a2, a0
 ; RV64I-PIC-NEXT:    lw a0, 0(a0)
-; RV64I-PIC-NEXT:    add a0, a0, a2
+; RV64I-PIC-NEXT:    add a0, a2, a0
 ; RV64I-PIC-NEXT:    jr a0
 ; RV64I-PIC-NEXT:  .LBB0_2: # %bb1
 ; RV64I-PIC-NEXT:    li a0, 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-medium-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-medium-rv64.mir
new file mode 100644
index 0000000000000..4b1cb14b273e2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-medium-rv64.mir
@@ -0,0 +1,162 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
+# RUN:   -code-model=medium | FileCheck %s
+
+--- |
+  define i32 @jt_test(i32 signext %in) {
+  entry:
+    %0 = sext i32 %in to i64
+    switch i64 %0, label %default [
+      i64 1, label %bb1
+      i64 2, label %bb2
+      i64 3, label %bb3
+      i64 4, label %bb4
+      i64 5, label %bb5
+      i64 6, label %bb6
+    ]
+
+  bb1:
+    ret i32 4
+
+  bb2:
+    ret i32 3
+
+  bb3:
+    ret i32 2
+
+  bb4:
+    ret i32 1
+
+  bb5:
+    ret i32 100
+
+  bb6:
+    ret i32 200
+
+  default:
+    ret i32 1000
+  }
+
+...
+---
+name:            jt_test
+tracksRegLiveness: true
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
+body:             |
+  ; CHECK-LABEL: name: jt_test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+  ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s64) = G_ASSERT_SEXT [[COPY]], 32
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   [[SUB:%[0-9]+]]:_(s64) = G_SUB [[ASSERT_SEXT]], [[C1]]
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[SUB]](s64), [[C]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s64), %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[JUMP_TABLE:%[0-9]+]]:_(p0) = G_JUMP_TABLE %jump-table.0
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SUB]], [[C2]](s64)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SHL]](s64)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from jump-table)
+  ; CHECK-NEXT:   G_BRINDIRECT [[LOAD]](p0)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb1:
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK-NEXT:   $x10 = COPY [[C3]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.bb2:
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+  ; CHECK-NEXT:   $x10 = COPY [[C4]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.bb3:
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK-NEXT:   $x10 = COPY [[C5]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.bb4:
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   $x10 = COPY [[C6]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.bb5:
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
+  ; CHECK-NEXT:   $x10 = COPY [[C7]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.bb6:
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 200
+  ; CHECK-NEXT:   $x10 = COPY [[C8]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8.default:
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1000
+  ; CHECK-NEXT:   $x10 = COPY [[C9]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  bb.1.entry:
+    successors: %bb.8, %bb.9
+    liveins: $x10
+
+    %1:_(s64) = COPY $x10
+    %2:_(s64) = G_ASSERT_SEXT %1, 32
+    %0:_(s32) = G_TRUNC %2(s64)
+    %7:_(s64) = G_CONSTANT i64 5
+    %3:_(s64) = G_SEXT %0(s32)
+    %4:_(s64) = G_CONSTANT i64 1
+    %5:_(s64) = G_SUB %3, %4
+    %9:_(s1) = G_ICMP intpred(ugt), %5(s64), %7
+    G_BRCOND %9(s1), %bb.8
+
+  bb.9.entry:
+    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
+
+    %10:_(p0) = G_JUMP_TABLE %jump-table.0
+    G_BRJT %10(p0), %jump-table.0, %5(s64)
+
+  bb.2.bb1:
+    %22:_(s64) = G_CONSTANT i64 4
+    $x10 = COPY %22(s64)
+    PseudoRET implicit $x10
+
+  bb.3.bb2:
+    %20:_(s64) = G_CONSTANT i64 3
+    $x10 = COPY %20(s64)
+    PseudoRET implicit $x10
+
+  bb.4.bb3:
+    %18:_(s64) = G_CONSTANT i64 2
+    $x10 = COPY %18(s64)
+    PseudoRET implicit $x10
+
+  bb.5.bb4:
+    %16:_(s64) = G_CONSTANT i64 1
+    $x10 = COPY %16(s64)
+    PseudoRET implicit $x10
+
+  bb.6.bb5:
+    %14:_(s64) = G_CONSTANT i64 100
+    $x10 = COPY %14(s64)
+    PseudoRET implicit $x10
+
+  bb.7.bb6:
+    %12:_(s64) = G_CONSTANT i64 200
+    $x10 = COPY %12(s64)
+    PseudoRET implicit $x10
+
+  bb.8.default:
+    %24:_(s64) = G_CONSTANT i64 1000
+    $x10 = COPY %24(s64)
+    PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv32.mir
similarity index 57%
rename from llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir
rename to llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv32.mir
index 396421a4ba739..d48d21896f966 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv32.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=riscv32 -run-pass=instruction-select %s -o - \
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
 # RUN:   -relocation-model=pic | FileCheck %s
 
 --- |
@@ -39,8 +39,6 @@
 ...
 ---
 name:            jt_test
-legalized:       true
-regBankSelected: true
 tracksRegLiveness: true
 jumpTable:
   kind:            label-difference32
@@ -53,77 +51,80 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT:   liveins: $x10
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 5
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 200
-  ; CHECK-NEXT:   [[ADDI2:%[0-9]+]]:gpr = ADDI $x0, 100
-  ; CHECK-NEXT:   [[ADDI3:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; CHECK-NEXT:   [[ADDI4:%[0-9]+]]:gpr = ADDI $x0, 2
-  ; CHECK-NEXT:   [[ADDI5:%[0-9]+]]:gpr = ADDI $x0, 3
-  ; CHECK-NEXT:   [[ADDI6:%[0-9]+]]:gpr = ADDI $x0, 4
-  ; CHECK-NEXT:   [[ADDI7:%[0-9]+]]:gpr = ADDI $x0, 1000
-  ; CHECK-NEXT:   [[ADDI8:%[0-9]+]]:gpr = ADDI [[COPY]], -1
-  ; CHECK-NEXT:   BLTU [[ADDI]], [[ADDI8]], %bb.8
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 200
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[C8]]
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[ADD]](s32), [[C]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s32), %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PseudoLLA:%[0-9]+]]:gpr = PseudoLLA %jump-table.0
-  ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[ADDI8]], 2
-  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[PseudoLLA]], [[SLLI]]
-  ; CHECK-NEXT:   [[LW:%[0-9]+]]:gpr = LW [[ADD]], 0 :: (load (s32) from jump-table)
-  ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:gprjalr = ADD [[LW]], [[PseudoLLA]]
-  ; CHECK-NEXT:   PseudoBRIND [[ADD1]], 0
+  ; CHECK-NEXT:   [[JUMP_TABLE:%[0-9]+]]:_(p0) = G_JUMP_TABLE %jump-table.0
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SHL]](s32)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from jump-table)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[LOAD]](s32)
+  ; CHECK-NEXT:   G_BRINDIRECT [[PTR_ADD1]](p0)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI6]]
+  ; CHECK-NEXT:   $x10 = COPY [[C6]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI5]]
+  ; CHECK-NEXT:   $x10 = COPY [[C5]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4.bb3:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI4]]
+  ; CHECK-NEXT:   $x10 = COPY [[C4]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5.bb4:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI3]]
+  ; CHECK-NEXT:   $x10 = COPY [[C3]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6.bb5:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI2]]
+  ; CHECK-NEXT:   $x10 = COPY [[C2]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7.bb6:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI1]]
+  ; CHECK-NEXT:   $x10 = COPY [[C1]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8.default:
-  ; CHECK-NEXT:   $x10 = COPY [[ADDI7]]
+  ; CHECK-NEXT:   $x10 = COPY [[C7]](s32)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   bb.1.entry:
     successors: %bb.8, %bb.9
     liveins: $x10
 
-    %0:gprb(s32) = COPY $x10
-    %4:gprb(s32) = G_CONSTANT i32 5
-    %8:gprb(s32) = G_CONSTANT i32 200
-    %9:gprb(s32) = G_CONSTANT i32 100
-    %10:gprb(s32) = G_CONSTANT i32 1
-    %11:gprb(s32) = G_CONSTANT i32 2
-    %12:gprb(s32) = G_CONSTANT i32 3
-    %13:gprb(s32) = G_CONSTANT i32 4
-    %14:gprb(s32) = G_CONSTANT i32 1000
-    %1:gprb(s32) = G_CONSTANT i32 -1
-    %2:gprb(s32) = G_ADD %0, %1
-    %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4
-    G_BRCOND %16(s32), %bb.8
+    %0:_(s32) = COPY $x10
+    %4:_(s32) = G_CONSTANT i32 5
+    %8:_(s32) = G_CONSTANT i32 200
+    %9:_(s32) = G_CONSTANT i32 100
+    %10:_(s32) = G_CONSTANT i32 1
+    %11:_(s32) = G_CONSTANT i32 2
+    %12:_(s32) = G_CONSTANT i32 3
+    %13:_(s32) = G_CONSTANT i32 4
+    %14:_(s32) = G_CONSTANT i32 1000
+    %1:_(s32) = G_CONSTANT i32 -1
+    %2:_(s32) = G_ADD %0, %1
+    %6:_(s1) = G_ICMP intpred(ugt), %2(s32), %4
+    G_BRCOND %6(s1), %bb.8
 
   bb.9.entry:
     successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
 
-    %7:gprb(p0) = G_JUMP_TABLE %jump-table.0
+    %7:_(p0) = G_JUMP_TABLE %jump-table.0
     G_BRJT %7(p0), %jump-table.0, %2(s32)
 
   bb.2.bb1:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv64.mir
new file mode 100644
index 0000000000000..0b5357a62d0a8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-pic-rv64.mir
@@ -0,0 +1,163 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
+# RUN:   -relocation-model=pic | FileCheck %s
+
+--- |
+  define i32 @jt_test(i32 signext %in) {
+  entry:
+    %0 = sext i32 %in to i64
+    switch i64 %0, label %default [
+      i64 1, label %bb1
+      i64 2, label %bb2
+      i64 3, label %bb3
+      i64 4, label %bb4
+      i64 5, label %bb5
+      i64 6, label %bb6
+    ]
+
+  bb1:
+    ret i32 4
+
+  bb2:
+    ret i32 3
+
+  bb3:
+    ret i32 2
+
+  bb4:
+    ret i32 1
+
+  bb5:
+    ret i32 100
+
+  bb6:
+    ret i32 200
+
+  default:
+    ret i32 1000
+  }
+
+...
+---
+name:            jt_test
+tracksRegLiveness: true
+jumpTable:
+  kind:            label-difference32
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3', '%bb.4', '%bb.5', '%bb.6', '%bb.7' ]
+body:             |
+  ; CHECK-LABEL: name: jt_test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+  ; CHECK-NEXT:   [[ASSERT_SEXT:%[0-9]+]]:_(s64) = G_ASSERT_SEXT [[COPY]], 32
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   [[SUB:%[0-9]+]]:_(s64) = G_SUB [[ASSERT_SEXT]], [[C1]]
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ugt), [[SUB]](s64), [[C]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s64), %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[JUMP_TABLE:%[0-9]+]]:_(p0) = G_JUMP_TABLE %jump-table.0
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SUB]], [[C2]](s64)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SHL]](s64)
+  ; CHECK-NEXT:   [[SEXTLOAD:%[0-9]+]]:_(s64) = G_SEXTLOAD [[PTR_ADD]](p0) :: (load (s32) from jump-table)
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SEXTLOAD]](s64)
+  ; CHECK-NEXT:   G_BRINDIRECT [[PTR_ADD1]](p0)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb1:
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK-NEXT:   $x10 = COPY [[C3]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.bb2:
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+  ; CHECK-NEXT:   $x10 = COPY [[C4]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.bb3:
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK-NEXT:   $x10 = COPY [[C5]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.bb4:
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   $x10 = COPY [[C6]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.bb5:
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
+  ; CHECK-NEXT:   $x10 = COPY [[C7]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.bb6:
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 200
+  ; CHECK-NEXT:   $x10 = COPY [[C8]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8.default:
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1000
+  ; CHECK-NEXT:   $x10 = COPY [[C9]](s64)
+  ; CHECK-NEXT:   PseudoRET implicit $x10
+  bb.1.entry:
+    successors: %bb.8, %bb.9
+    liveins: $x10
+
+    %1:_(s64) = COPY $x10
+    %2:_(s64) = G_ASSERT_SEXT %1, 32
+    %0:_(s32) = G_TRUNC %2(s64)
+    %7:_(s64) = G_CONSTANT i64 5
+    %3:_(s64) = G_SEXT %0(s32)
+    %4:_(s64) = G_CONSTANT i64 1
+    %5:_(s64) = G_SUB %3, %4
+    %9:_(s1) = G_ICMP intpred(ugt), %5(s64), %7
+    G_BRCOND %9(s1), %bb.8
+
+  bb.9.entry:
+    successors: %bb.2, %bb.3, %bb.4, %bb.5, %bb.6, %bb.7
+
+    %10:_(p0) = G_JUMP_TABLE %jump-table.0
+    G_BRJT %10(p0), %jump-table.0, %5(s64)
+
+  bb.2.bb1:
+    %22:_(s64) = G_CONSTANT i64 4
+    $x10 = COPY %22(s64)
+    PseudoRET implicit $x10
+
+  bb.3.bb2:
+    %20:_(s64) = G_CONSTANT i64 3
+    $x10 = COPY %20(s64)
+    PseudoRET implicit $x10
+
+  bb.4.bb3:
+    %18:_(s64) = G_CONSTANT i64 2
+    $x10 = COPY %18(s64)
+    PseudoRET implicit $x10
+
+  bb.5.bb4:
+    %16:_(s64) = G_CONSTANT i64 1
+    $x10 = COPY %16(s64)
+    PseudoRET implicit $x10
+
+  bb.6.bb5:
+    %14:_(s64) = G_CONSTANT i64 100
+    $x10 = COPY %14(s64)
+    PseudoRET implicit $x10
+
+  bb.7.bb6:
+    %12:_(s64) = G_CONSTANT i64 200
+    $x10 = COPY %12(s64)
+    PseudoRET implicit $x10
+
+  bb.8.default:
+    %24:_(s64) = G_CONSTANT i64 1000
+    $x10 = COPY %24(s64)
+    PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv32.mir
index a7fd7da77261f..97e2badb3ed0e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv32.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
-# RUN: | FileCheck %s
+# RUN:   -code-model=small | FileCheck %s
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN:   -code-model=medium | FileCheck %s
 
 --- |
   define i32 @jt_test(i32 signext %in) {
@@ -69,7 +71,11 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[JUMP_TABLE:%[0-9]+]]:_(p0) = G_JUMP_TABLE %jump-table.0
-  ; CHECK-NEXT:   G_BRJT [[JUMP_TABLE]](p0), %jump-table.0, [[SUB]](s32)
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[SUB]], [[C9]](s32)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SHL]](s32)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from jump-table)
+  ; CHECK-NEXT:   G_BRINDIRECT [[LOAD]](p0)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
   ; CHECK-NEXT:   $x10 = COPY [[C6]](s32)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-small-rv64.mir
similarity index 81%
rename from llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv64.mir
rename to llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-small-rv64.mir
index bf0661b88a78b..f3b8310df91ec 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-jump-table-brjt-small-rv64.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
-# RUN: | FileCheck %s
+# RUN:   -code-model=small | FileCheck %s
 
 --- |
   define i32 @jt_test(i32 signext %in) {
@@ -64,41 +64,46 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x15555555), %bb.3(0x15555555), %bb.4(0x15555555), %bb.5(0x15555555), %bb.6(0x15555555), %bb.7(0x15555555)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[JUMP_TABLE:%[0-9]+]]:_(p0) = G_JUMP_TABLE %jump-table.0
-  ; CHECK-NEXT:   G_BRJT [[JUMP_TABLE]](p0), %jump-table.0, [[SUB]](s64)
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SUB]], [[C2]](s64)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[JUMP_TABLE]], [[SHL]](s64)
+  ; CHECK-NEXT:   [[SEXTLOAD:%[0-9]+]]:_(s64) = G_SEXTLOAD [[PTR_ADD]](p0) :: (load (s32) from jump-table)
+  ; CHECK-NEXT:   [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SEXTLOAD]](s64)
+  ; CHECK-NEXT:   G_BRINDIRECT [[INTTOPTR]](p0)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
-  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; CHECK-NEXT:   $x10 = COPY [[C2]](s64)
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK-NEXT:   $x10 = COPY [[C3]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
-  ; CHECK-NEXT:   $x10 = COPY [[C3]](s64)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+  ; CHECK-NEXT:   $x10 = COPY [[C4]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4.bb3:
-  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-  ; CHECK-NEXT:   $x10 = COPY [[C4]](s64)
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+  ; CHECK-NEXT:   $x10 = COPY [[C5]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5.bb4:
-  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-  ; CHECK-NEXT:   $x10 = COPY [[C5]](s64)
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   $x10 = COPY [[C6]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6.bb5:
-  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
-  ; CHECK-NEXT:   $x10 = COPY [[C6]](s64)
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 100
+  ; CHECK-NEXT:   $x10 = COPY [[C7]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7.bb6:
-  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 200
-  ; CHECK-NEXT:   $x10 = COPY [[C7]](s64)
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 200
+  ; CHECK-NEXT:   $x10 = COPY [[C8]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8.default:
-  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 1000
-  ; CHECK-NEXT:   $x10 = COPY [[C8]](s64)
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 1000
+  ; CHECK-NEXT:   $x10 = COPY [[C9]](s64)
   ; CHECK-NEXT:   PseudoRET implicit $x10
   bb.1.entry:
     successors: %bb.8, %bb.9
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir
index caa7a775e54a3..93b145c5049fd 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir
@@ -25,16 +25,18 @@ body:             |
     ; CHECK: liveins: $x10
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; CHECK-NEXT: $x10 = COPY [[LOAD]](s32)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s32)
     ; CHECK-NEXT: PseudoRET implicit $x10
     ;
     ; UNALIGNED-LABEL: name: load_i8
     ; UNALIGNED: liveins: $x10
     ; UNALIGNED-NEXT: {{  $}}
     ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
-    ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32)
+    ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
+    ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s32)
     ; UNALIGNED-NEXT: PseudoRET implicit $x10
     %0:_(p0) = COPY $x10
     %1:_(s8) = G_LOAD %0(p0) :: (load (s8))
@@ -231,9 +233,10 @@ body:             |
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[C1]](s32)
     ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; CHECK-NEXT: $x10 = COPY [[OR]](s32)
     ; CHECK-NEXT: PseudoRET implicit $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir
index 40897b845ede7..d85d2c5f51c72 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir
@@ -25,8 +25,8 @@ body:             |
     ; CHECK: liveins: $x10
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s16)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     ;
@@ -34,8 +34,8 @@ body:             |
     ; UNALIGNED: liveins: $x10
     ; UNALIGNED-NEXT: {{  $}}
     ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
-    ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s16)
     ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64)
     ; UNALIGNED-NEXT: PseudoRET implicit $x10
     %0:_(p0) = COPY $x10
@@ -274,9 +274,9 @@ body:             |
     ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s64) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s16)
     ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C1]](s64)
     ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; CHECK-NEXT: $x10 = COPY [[OR]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir
index 29466839089bf..5a7a042ab4afd 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir
@@ -26,7 +26,8 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i8
@@ -34,7 +35,8 @@ body:             |
     ; UNALIGNED-NEXT: {{  $}}
     ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
     ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8))
+    ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
     ; UNALIGNED-NEXT: PseudoRET
     %2:_(s32) = COPY $x10
     %0:_(s8) = G_TRUNC %2(s32)
@@ -228,15 +230,18 @@ body:             |
     ; CHECK: liveins: $x10, $x11
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s32)
-    ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8))
-    ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: G_STORE [[COPY2]](s16), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: G_STORE [[TRUNC1]](s16), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1)
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i16_unaligned
@@ -280,19 +285,23 @@ body:             |
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
     ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C2]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
     ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C4]](s32)
-    ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store (s8))
-    ; CHECK-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: G_STORE [[TRUNC1]](s16), [[PTR_ADD1]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C5]](s32)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
     ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C4]](s32)
-    ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2)
-    ; CHECK-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3)
+    ; CHECK-NEXT: G_STORE [[TRUNC2]](s16), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s16), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3)
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i32_unaligned
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir
index 280b3add09330..8704ddec39f4f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir
@@ -26,8 +26,8 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i8
@@ -35,8 +35,8 @@ body:             |
     ; UNALIGNED-NEXT: {{  $}}
     ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8))
+    ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
     ; UNALIGNED-NEXT: PseudoRET
     %2:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %2(s64)
@@ -259,17 +259,18 @@ body:             |
     ; CHECK: liveins: $x10, $x11
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]]
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8))
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: G_STORE [[COPY2]](s16), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: G_STORE [[TRUNC1]](s16), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1)
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i16_unaligned
@@ -308,32 +309,31 @@ body:             |
     ; CHECK: liveins: $x10, $x11
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]]
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND]], [[C]](s64)
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
     ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C4]]
     ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s64)
     ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C5]](s64)
-    ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store (s8))
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR1]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC2]](s32), [[PTR_ADD1]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY1]](p0) :: (store (s8))
+    ; CHECK-NEXT: G_STORE [[TRUNC1]](s16), [[PTR_ADD1]](p0) :: (store (s8) into unknown-address + 1)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64)
     ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
     ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C4]]
     ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[AND2]], [[C6]](s64)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s64)
     ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C5]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2)
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR2]](s64)
-    ; CHECK-NEXT: G_STORE [[TRUNC3]](s32), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3)
+    ; CHECK-NEXT: G_STORE [[TRUNC2]](s16), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s16), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3)
     ; CHECK-NEXT: PseudoRET
     ;
     ; UNALIGNED-LABEL: name: store_i32_unaligned
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f61b403cc7c53..1c8c459d1b316 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -126,11 +126,11 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
+; RUN: llc -mtriple=riscv32 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV32ZAMA16B %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+zabha %s -o - | FileCheck --check-prefix=RV32ZABHA %s
+; RUN: llc -mtriple=riscv32 -mattr=+zabha %s -o - | FileCheck --check-prefix=RV32ZABHA %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+experimental-zvbc32e  %s -o - | FileCheck --check-prefix=RV32ZVBC32E %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+experimental-zvkgs  %s -o - | FileCheck --check-prefix=RV32ZVKGS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+ssnpm  %s -o - | FileCheck --check-prefix=RV32SSNPM %s
@@ -274,10 +274,10 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
+; RUN: llc -mtriple=riscv64 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV64ZALASR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV64ZICFILP %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha %s -o - | FileCheck --check-prefix=RV64ZABHA %s
+; RUN: llc -mtriple=riscv64 -mattr=+zabha %s -o - | FileCheck --check-prefix=RV64ZABHA %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+experimental-zvbc32e  %s -o - | FileCheck --check-prefix=RV64ZVBC32E %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+experimental-zvkgs  %s -o - | FileCheck --check-prefix=RV64ZVKGS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+ssnpm  %s -o - | FileCheck --check-prefix=RV64SSNPM %s
@@ -428,11 +428,11 @@
 ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0"
 ; RV32ZVFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0"
 ; RV32ZVFBFWMA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0"
-; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zaamo1p0_zacas1p0"
+; RV32ZACAS: .attribute 5, "rv32i2p1_zaamo1p0_zacas1p0"
 ; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1"
 ; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0"
 ; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp1p0_zicsr2p0"
-; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zaamo1p0_zabha1p0"
+; RV32ZABHA: .attribute 5, "rv32i2p1_zaamo1p0_zabha1p0"
 ; RV32ZVBC32E: .attribute 5, "rv32i2p1_zicsr2p0_zvbc32e0p7_zve32x1p0_zvl32b1p0"
 ; RV32ZVKGS: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvkg1p0_zvkgs0p7_zvl32b1p0"
 ; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm1p0"
@@ -574,10 +574,10 @@
 ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0"
 ; RV64ZVFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0"
 ; RV64ZVFBFWMA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0"
-; RV64ZACAS: .attribute 5, "rv64i2p1_a2p1_zaamo1p0_zacas1p0"
+; RV64ZACAS: .attribute 5, "rv64i2p1_zaamo1p0_zacas1p0"
 ; RV64ZALASR: .attribute 5, "rv64i2p1_zalasr0p1"
 ; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp1p0_zicsr2p0"
-; RV64ZABHA: .attribute 5, "rv64i2p1_a2p1_zaamo1p0_zabha1p0"
+; RV64ZABHA: .attribute 5, "rv64i2p1_zaamo1p0_zabha1p0"
 ; RV64ZVBC32E: .attribute 5, "rv64i2p1_zicsr2p0_zvbc32e0p7_zve32x1p0_zvl32b1p0"
 ; RV64ZVKGS: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvkg1p0_zvkgs0p7_zvl32b1p0"
 ; RV64SSNPM: .attribute 5, "rv64i2p1_ssnpm1p0"
diff --git a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
new file mode 100644
index 0000000000000..04a5d268aebff
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+define i64 @test_Pr_wide_scalar_simple(i64 noundef %0) nounwind {
+; CHECK-LABEL: test_Pr_wide_scalar_simple:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a2 <- a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:    ret
+entry:
+  %1 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %0)
+  ret i64 %1
+}
+
+define i32 @test_Pr_wide_scalar_with_ops(i32 noundef %0) nounwind {
+; CHECK-LABEL: test_Pr_wide_scalar_with_ops:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a2 <- a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or a0, a2, a3
+; CHECK-NEXT:    ret
+entry:
+  %1 = zext i32 %0 to i64
+  %2 = shl i64 %1, 32
+  %3 = or i64 %1, %2
+  %4 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %3)
+  %5 = trunc i64 %4 to i32
+  %6 = lshr i64 %4, 32
+  %7 = trunc i64 %6 to i32
+  %8 = or i32 %5, %7
+  ret i32 %8
+}
+
+define i64 @test_Pr_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind {
+; CHECK-LABEL: test_Pr_wide_scalar_inout:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    sw a0, 12(sp)
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sw a1, 0(sp)
+; CHECK-NEXT:    sw a3, 4(sp)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a0; a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    sw a0, 12(sp)
+; CHECK-NEXT:    sw a2, 0(sp)
+; CHECK-NEXT:    sw a3, 4(sp)
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %2 = alloca ptr, align 4
+  %3 = alloca i64, align 8
+  store ptr %0, ptr %2, align 4
+  store i64 %1, ptr %3, align 8
+  %4 = load ptr, ptr %2, align 4
+  %5 = load i64, ptr %3, align 8
+  %6 = call { ptr, i64 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i64 %5)
+  %7 = extractvalue { ptr, i64} %6, 0
+  %8 = extractvalue { ptr, i64 } %6, 1
+  store ptr %7, ptr %2, align 4
+  store i64 %8, ptr %3, align 8
+  %9 = load i64, ptr %3, align 8
+  ret i64 %9
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
new file mode 100644
index 0000000000000..41f353d0781ae
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+define i128 @test_R_wide_scalar_simple(i128 noundef %0) nounwind {
+; CHECK-LABEL: test_R_wide_scalar_simple:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a2 <- a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:    ret
+entry:
+  %1 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %0)
+  ret i128 %1
+}
+
+define i64 @test_R_wide_scalar_with_ops(i64 noundef %0) nounwind {
+; CHECK-LABEL: test_R_wide_scalar_with_ops:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a2 <- a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    or a0, a2, a3
+; CHECK-NEXT:    ret
+entry:
+  %1 = zext i64 %0 to i128
+  %2 = shl i128 %1, 64
+  %3 = or i128 %1, %2
+  %4 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %3)
+  %5 = trunc i128 %4 to i64
+  %6 = lshr i128 %4, 64
+  %7 = trunc i128 %6 to i64
+  %8 = or i64 %5, %7
+  ret i64 %8
+}
+
+define i128 @test_R_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind {
+; CHECK-LABEL: test_R_wide_scalar_inout:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    sd a0, 24(sp)
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sd a1, 0(sp)
+; CHECK-NEXT:    sd a3, 8(sp)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # a0; a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    sd a0, 24(sp)
+; CHECK-NEXT:    sd a2, 0(sp)
+; CHECK-NEXT:    sd a3, 8(sp)
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:    addi sp, sp, 32
+; CHECK-NEXT:    ret
+entry:
+  %2 = alloca ptr, align 8
+  %3 = alloca i128, align 16
+  store ptr %0, ptr %2, align 8
+  store i128 %1, ptr %3, align 16
+  %4 = load ptr, ptr %2, align 8
+  %5 = load i128, ptr %3, align 16
+  %6 = call { ptr, i128 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i128 %5)
+  %7 = extractvalue { ptr, i128} %6, 0
+  %8 = extractvalue { ptr, i128 } %6, 1
+  store ptr %7, ptr %2, align 8
+  store i128 %8, ptr %3, align 16
+  %9 = load i128, ptr %3, align 16
+  ret i128 %9
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index d506842b5eff6..1a08c613ca36a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTZVE32F
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTV
 
 %struct.foo = type { i32, i32, i32, i32 }
 
@@ -677,6 +677,54 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
+;
+; OPTZVE32F-LABEL: gather_of_pointers:
+; OPTZVE32F:       # %bb.0: # %bb
+; OPTZVE32F-NEXT:    lui a2, 2
+; OPTZVE32F-NEXT:    add a2, a0, a2
+; OPTZVE32F-NEXT:    li a3, 40
+; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; OPTZVE32F-NEXT:  .LBB12_1: # %bb2
+; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
+; OPTZVE32F-NEXT:    vlse64.v v8, (a1), a3
+; OPTZVE32F-NEXT:    addi a4, a1, 80
+; OPTZVE32F-NEXT:    vlse64.v v9, (a4), a3
+; OPTZVE32F-NEXT:    addi a4, a0, 16
+; OPTZVE32F-NEXT:    vse64.v v8, (a0)
+; OPTZVE32F-NEXT:    addi a0, a0, 32
+; OPTZVE32F-NEXT:    vse64.v v9, (a4)
+; OPTZVE32F-NEXT:    addi a1, a1, 160
+; OPTZVE32F-NEXT:    bne a0, a2, .LBB12_1
+; OPTZVE32F-NEXT:  # %bb.2: # %bb18
+; OPTZVE32F-NEXT:    ret
+;
+; OPTV-LABEL: gather_of_pointers:
+; OPTV:       # %bb.0: # %bb
+; OPTV-NEXT:    li a2, 0
+; OPTV-NEXT:    lui a4, 2
+; OPTV-NEXT:    li a3, 1
+; OPTV-NEXT:    add a4, a0, a4
+; OPTV-NEXT:    li a5, 40
+; OPTV-NEXT:  .LBB12_1: # %bb2
+; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
+; OPTV-NEXT:    mul a6, a3, a5
+; OPTV-NEXT:    mul a7, a2, a5
+; OPTV-NEXT:    addi a2, a2, 4
+; OPTV-NEXT:    add a6, a1, a6
+; OPTV-NEXT:    add a7, a1, a7
+; OPTV-NEXT:    ld t0, 0(a7)
+; OPTV-NEXT:    ld t1, 0(a6)
+; OPTV-NEXT:    ld a7, 80(a7)
+; OPTV-NEXT:    ld a6, 80(a6)
+; OPTV-NEXT:    sd t0, 0(a0)
+; OPTV-NEXT:    sd t1, 8(a0)
+; OPTV-NEXT:    sd a7, 16(a0)
+; OPTV-NEXT:    sd a6, 24(a0)
+; OPTV-NEXT:    addi a0, a0, 32
+; OPTV-NEXT:    addi a3, a3, 4
+; OPTV-NEXT:    bne a0, a4, .LBB12_1
+; OPTV-NEXT:  # %bb.2: # %bb18
+; OPTV-NEXT:    ret
 bb:
   br label %bb2
 
@@ -754,6 +802,54 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
+;
+; OPTZVE32F-LABEL: scatter_of_pointers:
+; OPTZVE32F:       # %bb.0: # %bb
+; OPTZVE32F-NEXT:    lui a2, 2
+; OPTZVE32F-NEXT:    add a2, a1, a2
+; OPTZVE32F-NEXT:    li a3, 40
+; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; OPTZVE32F-NEXT:  .LBB13_1: # %bb2
+; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
+; OPTZVE32F-NEXT:    addi a4, a1, 16
+; OPTZVE32F-NEXT:    vle64.v v8, (a1)
+; OPTZVE32F-NEXT:    vle64.v v9, (a4)
+; OPTZVE32F-NEXT:    addi a4, a0, 80
+; OPTZVE32F-NEXT:    addi a1, a1, 32
+; OPTZVE32F-NEXT:    vsse64.v v8, (a0), a3
+; OPTZVE32F-NEXT:    vsse64.v v9, (a4), a3
+; OPTZVE32F-NEXT:    addi a0, a0, 160
+; OPTZVE32F-NEXT:    bne a1, a2, .LBB13_1
+; OPTZVE32F-NEXT:  # %bb.2: # %bb18
+; OPTZVE32F-NEXT:    ret
+;
+; OPTV-LABEL: scatter_of_pointers:
+; OPTV:       # %bb.0: # %bb
+; OPTV-NEXT:    li a2, 0
+; OPTV-NEXT:    lui a4, 2
+; OPTV-NEXT:    li a3, 1
+; OPTV-NEXT:    add a4, a1, a4
+; OPTV-NEXT:    li a5, 40
+; OPTV-NEXT:  .LBB13_1: # %bb2
+; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
+; OPTV-NEXT:    ld a6, 0(a1)
+; OPTV-NEXT:    ld a7, 8(a1)
+; OPTV-NEXT:    ld t0, 16(a1)
+; OPTV-NEXT:    ld t1, 24(a1)
+; OPTV-NEXT:    mul t2, a3, a5
+; OPTV-NEXT:    mul t3, a2, a5
+; OPTV-NEXT:    addi a2, a2, 4
+; OPTV-NEXT:    addi a1, a1, 32
+; OPTV-NEXT:    add t2, a0, t2
+; OPTV-NEXT:    add t3, a0, t3
+; OPTV-NEXT:    sd a6, 0(t3)
+; OPTV-NEXT:    sd a7, 0(t2)
+; OPTV-NEXT:    sd t0, 80(t3)
+; OPTV-NEXT:    sd t1, 80(t2)
+; OPTV-NEXT:    addi a3, a3, 4
+; OPTV-NEXT:    bne a1, a4, .LBB13_1
+; OPTV-NEXT:  # %bb.2: # %bb18
+; OPTV-NEXT:    ret
 bb:
   br label %bb2
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index 8eaa5efe163cd..d30e8b46e6df2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -28,6 +28,18 @@ define void @vpstore_v4i8(<4 x i8> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl
   ret void
 }
 
+declare void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7>, ptr, <8 x i1>, i32)
+
+define void @vpstore_v8i7(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpstore_v8i7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  call void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 %evl)
+  ret void
+}
+
 declare void @llvm.vp.store.v8i8.p0(<8 x i8>, ptr, <8 x i1>, i32)
 
 define void @vpstore_v8i8(<8 x i8> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
@@ -285,10 +297,10 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    bltu a1, a3, .LBB23_2
+; CHECK-NEXT:    bltu a1, a3, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:  .LBB23_2:
+; CHECK-NEXT:  .LBB24_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
new file mode 100644
index 0000000000000..e70dcd16d02cd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll
@@ -0,0 +1,1387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64
+
+define <vscale x 16 x i1> @match_nxv16i8_v1i8(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: match_nxv16i8_v1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmand.mm v0, v10, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <1 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v2i8(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: match_nxv16i8_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v10, v10, 1
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmor.mm v8, v11, v10
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <2 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v4i8(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: match_nxv16i8_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v11, v10, 1
+; CHECK-NEXT:    vslidedown.vi v12, v10, 2
+; CHECK-NEXT:    vslidedown.vi v10, v10, 3
+; CHECK-NEXT:    vmv.x.s a1, v11
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmor.mm v11, v11, v12
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <4 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v8i8(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: match_nxv16i8_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v11, v10, 1
+; CHECK-NEXT:    vslidedown.vi v12, v10, 2
+; CHECK-NEXT:    vmv.x.s a1, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 3
+; CHECK-NEXT:    vmv.x.s a2, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 4
+; CHECK-NEXT:    vmv.x.s a3, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 5
+; CHECK-NEXT:    vmv.x.s a4, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 6
+; CHECK-NEXT:    vslidedown.vi v10, v10, 7
+; CHECK-NEXT:    vmv.x.s a5, v11
+; CHECK-NEXT:    vsetvli a6, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a2
+; CHECK-NEXT:    vmor.mm v11, v11, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a3
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vmseq.vx v11, v8, a4
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a5
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <8 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v16i8(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: match_nxv16i8_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v11, v10, 1
+; CHECK-NEXT:    vslidedown.vi v12, v10, 2
+; CHECK-NEXT:    vmv.x.s a1, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 3
+; CHECK-NEXT:    vmv.x.s a2, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 4
+; CHECK-NEXT:    vmv.x.s a3, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 5
+; CHECK-NEXT:    vmv.x.s a4, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 6
+; CHECK-NEXT:    vmv.x.s a5, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 7
+; CHECK-NEXT:    vmv.x.s a6, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 8
+; CHECK-NEXT:    vmv.x.s a7, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 9
+; CHECK-NEXT:    vmv.x.s t0, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 10
+; CHECK-NEXT:    vmv.x.s t1, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 11
+; CHECK-NEXT:    vmv.x.s t2, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 12
+; CHECK-NEXT:    vmv.x.s t3, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 13
+; CHECK-NEXT:    vmv.x.s t4, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 14
+; CHECK-NEXT:    vslidedown.vi v10, v10, 15
+; CHECK-NEXT:    vmv.x.s t5, v11
+; CHECK-NEXT:    vsetvli t6, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a2
+; CHECK-NEXT:    vmor.mm v11, v11, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a3
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vmseq.vx v11, v8, a4
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a5
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a6
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a7
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t0
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, t1
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t2
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, t3
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t4
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, t5
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <16 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) {
+; CHECK-LABEL: match_v16i8_v1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmseq.vx v8, v8, a0
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v2i8(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask) {
+; CHECK-LABEL: match_v16i8_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v9, 1
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vmseq.vx v8, v8, a0
+; CHECK-NEXT:    vmor.mm v8, v10, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v4i8(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask) {
+; CHECK-LABEL: match_v16i8_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v10, v9, 1
+; CHECK-NEXT:    vslidedown.vi v11, v9, 2
+; CHECK-NEXT:    vslidedown.vi v9, v9, 3
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmseq.vx v9, v8, a0
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vx v8, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v8i8(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask) {
+; CHECK-LABEL: match_v16i8_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v10, v9, 1
+; CHECK-NEXT:    vslidedown.vi v11, v9, 2
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 3
+; CHECK-NEXT:    vmv.x.s a2, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 4
+; CHECK-NEXT:    vmv.x.s a3, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 5
+; CHECK-NEXT:    vmv.x.s a4, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 6
+; CHECK-NEXT:    vslidedown.vi v9, v9, 7
+; CHECK-NEXT:    vmv.x.s a5, v10
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmseq.vx v9, v8, a2
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a3
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vx v10, v8, a4
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a5
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v8, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) {
+; CHECK-LABEL: match_v16i8_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v10, v8, v10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vrgather.vi v11, v9, 2
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 3
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 4
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 5
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 6
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 7
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 8
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 9
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 11
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 12
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 13
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 14
+; CHECK-NEXT:    vrgather.vi v12, v9, 15
+; CHECK-NEXT:    vmseq.vv v9, v8, v11
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vv v8, v8, v12
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <8 x i1> @match_v8i8_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
+; CHECK-LABEL: match_v8i8_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v10, v8, v10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vrgather.vi v11, v9, 2
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 3
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 4
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 5
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 6
+; CHECK-NEXT:    vrgather.vi v12, v9, 7
+; CHECK-NEXT:    vmseq.vv v9, v8, v11
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vv v8, v8, v12
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask)
+  ret <8 x i1> %r
+}
+
+define <vscale x 8 x i1> @match_nxv8i16_v8i16(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: match_nxv8i16_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v11, v10, 1
+; CHECK-NEXT:    vslidedown.vi v12, v10, 2
+; CHECK-NEXT:    vmv.x.s a1, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 3
+; CHECK-NEXT:    vmv.x.s a2, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 4
+; CHECK-NEXT:    vmv.x.s a3, v11
+; CHECK-NEXT:    vslidedown.vi v11, v10, 5
+; CHECK-NEXT:    vmv.x.s a4, v12
+; CHECK-NEXT:    vslidedown.vi v12, v10, 6
+; CHECK-NEXT:    vslidedown.vi v10, v10, 7
+; CHECK-NEXT:    vmv.x.s a5, v11
+; CHECK-NEXT:    vsetvli a6, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a2
+; CHECK-NEXT:    vmor.mm v11, v11, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a3
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vmseq.vx v11, v8, a4
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a5
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmor.mm v10, v10, v12
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 8 x i1> @llvm.experimental.vector.match(<vscale x 8 x i16> %op1, <8 x i16> %op2, <vscale x 8 x i1> %mask)
+  ret <vscale x 8 x i1> %r
+}
+
+define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
+; CHECK-LABEL: match_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v10, v8, v10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vrgather.vi v11, v9, 2
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 3
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 4
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 5
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vrgather.vi v11, v9, 6
+; CHECK-NEXT:    vrgather.vi v12, v9, 7
+; CHECK-NEXT:    vmseq.vv v9, v8, v11
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vv v8, v8, v12
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask)
+  ret <8 x i1> %r
+}
+
+; Cases where op2 has more elements than op1.
+
+define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) {
+; CHECK-LABEL: match_v8i8_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    vslidedown.vi v10, v9, 1
+; CHECK-NEXT:    vslidedown.vi v11, v9, 2
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 3
+; CHECK-NEXT:    vmv.x.s a2, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 4
+; CHECK-NEXT:    vmv.x.s a3, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 5
+; CHECK-NEXT:    vmv.x.s a4, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 6
+; CHECK-NEXT:    vmv.x.s a5, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 7
+; CHECK-NEXT:    vmv.x.s a6, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 8
+; CHECK-NEXT:    vmv.x.s a7, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 9
+; CHECK-NEXT:    vmv.x.s t0, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 10
+; CHECK-NEXT:    vmv.x.s t1, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 11
+; CHECK-NEXT:    vmv.x.s t2, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 12
+; CHECK-NEXT:    vmv.x.s t3, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 13
+; CHECK-NEXT:    vmv.x.s t4, v11
+; CHECK-NEXT:    vslidedown.vi v11, v9, 14
+; CHECK-NEXT:    vslidedown.vi v9, v9, 15
+; CHECK-NEXT:    vmv.x.s t5, v10
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v9
+; CHECK-NEXT:    vmseq.vx v9, v8, a2
+; CHECK-NEXT:    vmor.mm v10, v10, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a3
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vx v10, v8, a4
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a5
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a6
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, a7
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, t0
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t1
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, t2
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t3
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, t4
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmseq.vx v11, v8, t5
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmor.mm v9, v9, v11
+; CHECK-NEXT:    vmor.mm v9, v9, v10
+; CHECK-NEXT:    vmseq.vx v8, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <8 x i1> @llvm.experimental.vector.match(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask)
+  ret <8 x i1> %r
+}
+
+define <vscale x 16 x i1> @match_nxv16i8_v32i8(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask) {
+; RV32-LABEL: match_nxv16i8_v32i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -64
+; RV32-NEXT:    .cfi_def_cfa_offset 64
+; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    .cfi_offset s5, -28
+; RV32-NEXT:    .cfi_offset s6, -32
+; RV32-NEXT:    .cfi_offset s7, -36
+; RV32-NEXT:    .cfi_offset s8, -40
+; RV32-NEXT:    .cfi_offset s9, -44
+; RV32-NEXT:    .cfi_offset s10, -48
+; RV32-NEXT:    .cfi_offset s11, -52
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    vslidedown.vi v12, v10, 1
+; RV32-NEXT:    vslidedown.vi v13, v10, 2
+; RV32-NEXT:    vslidedown.vi v14, v10, 3
+; RV32-NEXT:    vslidedown.vi v15, v10, 4
+; RV32-NEXT:    vslidedown.vi v16, v10, 5
+; RV32-NEXT:    vslidedown.vi v17, v10, 6
+; RV32-NEXT:    vslidedown.vi v18, v10, 7
+; RV32-NEXT:    vslidedown.vi v19, v10, 8
+; RV32-NEXT:    vslidedown.vi v20, v10, 9
+; RV32-NEXT:    vslidedown.vi v21, v10, 10
+; RV32-NEXT:    vslidedown.vi v22, v10, 11
+; RV32-NEXT:    vslidedown.vi v23, v10, 12
+; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v10, 16
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 17
+; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 18
+; RV32-NEXT:    vmv.x.s a3, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 19
+; RV32-NEXT:    vmv.x.s a4, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 20
+; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 21
+; RV32-NEXT:    vmv.x.s a6, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 22
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 23
+; RV32-NEXT:    vmv.x.s t0, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 24
+; RV32-NEXT:    vmv.x.s t1, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 25
+; RV32-NEXT:    vmv.x.s t2, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 26
+; RV32-NEXT:    vmv.x.s t3, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 27
+; RV32-NEXT:    vmv.x.s t4, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 28
+; RV32-NEXT:    vmv.x.s t5, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 29
+; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 30
+; RV32-NEXT:    vmv.x.s s0, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 31
+; RV32-NEXT:    vmv.x.s s1, v24
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v11, v10, 13
+; RV32-NEXT:    vslidedown.vi v24, v10, 14
+; RV32-NEXT:    vslidedown.vi v10, v10, 15
+; RV32-NEXT:    vmv.x.s s2, v12
+; RV32-NEXT:    vmv.x.s s3, v13
+; RV32-NEXT:    vmv.x.s s4, v14
+; RV32-NEXT:    vmv.x.s s5, v15
+; RV32-NEXT:    vmv.x.s s6, v16
+; RV32-NEXT:    vmv.x.s s7, v17
+; RV32-NEXT:    vmv.x.s s8, v18
+; RV32-NEXT:    vmv.x.s s9, v19
+; RV32-NEXT:    vmv.x.s s10, v20
+; RV32-NEXT:    vmv.x.s s11, v21
+; RV32-NEXT:    vmv.x.s ra, v22
+; RV32-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV32-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    vmseq.vx v12, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v23
+; RV32-NEXT:    vmseq.vx v13, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s3
+; RV32-NEXT:    vmv.x.s s3, v24
+; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v10
+; RV32-NEXT:    vmseq.vx v10, v8, s5
+; RV32-NEXT:    vmor.mm v12, v12, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s6
+; RV32-NEXT:    vmor.mm v11, v12, v11
+; RV32-NEXT:    vmseq.vx v12, v8, s7
+; RV32-NEXT:    vmor.mm v11, v11, v14
+; RV32-NEXT:    vmseq.vx v14, v8, s8
+; RV32-NEXT:    vmor.mm v10, v11, v10
+; RV32-NEXT:    vmseq.vx v11, v8, s9
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s10
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s11
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, ra
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a0
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s2
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s3
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, s4
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a1
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, a2
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a3
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, a4
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a5
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, a6
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a7
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, t0
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t1
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, t2
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, t3
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmseq.vx v14, v8, t4
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t5
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmseq.vx v13, v8, t6
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s0
+; RV32-NEXT:    vmor.mm v10, v10, v14
+; RV32-NEXT:    vmor.mm v10, v10, v11
+; RV32-NEXT:    vmor.mm v10, v10, v13
+; RV32-NEXT:    vmor.mm v10, v10, v12
+; RV32-NEXT:    vmseq.vx v11, v8, s1
+; RV32-NEXT:    vmor.mm v8, v10, v11
+; RV32-NEXT:    vmand.mm v0, v8, v0
+; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    .cfi_restore s3
+; RV32-NEXT:    .cfi_restore s4
+; RV32-NEXT:    .cfi_restore s5
+; RV32-NEXT:    .cfi_restore s6
+; RV32-NEXT:    .cfi_restore s7
+; RV32-NEXT:    .cfi_restore s8
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    .cfi_restore s10
+; RV32-NEXT:    .cfi_restore s11
+; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: match_nxv16i8_v32i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -112
+; RV64-NEXT:    .cfi_def_cfa_offset 112
+; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    .cfi_offset s5, -56
+; RV64-NEXT:    .cfi_offset s6, -64
+; RV64-NEXT:    .cfi_offset s7, -72
+; RV64-NEXT:    .cfi_offset s8, -80
+; RV64-NEXT:    .cfi_offset s9, -88
+; RV64-NEXT:    .cfi_offset s10, -96
+; RV64-NEXT:    .cfi_offset s11, -104
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    vslidedown.vi v12, v10, 1
+; RV64-NEXT:    vslidedown.vi v13, v10, 2
+; RV64-NEXT:    vslidedown.vi v14, v10, 3
+; RV64-NEXT:    vslidedown.vi v15, v10, 4
+; RV64-NEXT:    vslidedown.vi v16, v10, 5
+; RV64-NEXT:    vslidedown.vi v17, v10, 6
+; RV64-NEXT:    vslidedown.vi v18, v10, 7
+; RV64-NEXT:    vslidedown.vi v19, v10, 8
+; RV64-NEXT:    vslidedown.vi v20, v10, 9
+; RV64-NEXT:    vslidedown.vi v21, v10, 10
+; RV64-NEXT:    vslidedown.vi v22, v10, 11
+; RV64-NEXT:    vslidedown.vi v23, v10, 12
+; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v10, 16
+; RV64-NEXT:    vmv.x.s a1, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 17
+; RV64-NEXT:    vmv.x.s a2, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 18
+; RV64-NEXT:    vmv.x.s a3, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 19
+; RV64-NEXT:    vmv.x.s a4, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 20
+; RV64-NEXT:    vmv.x.s a5, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 21
+; RV64-NEXT:    vmv.x.s a6, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 22
+; RV64-NEXT:    vmv.x.s a7, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 23
+; RV64-NEXT:    vmv.x.s t0, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 24
+; RV64-NEXT:    vmv.x.s t1, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 25
+; RV64-NEXT:    vmv.x.s t2, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 26
+; RV64-NEXT:    vmv.x.s t3, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 27
+; RV64-NEXT:    vmv.x.s t4, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 28
+; RV64-NEXT:    vmv.x.s t5, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 29
+; RV64-NEXT:    vmv.x.s t6, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 30
+; RV64-NEXT:    vmv.x.s s0, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 31
+; RV64-NEXT:    vmv.x.s s1, v24
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v11, v10, 13
+; RV64-NEXT:    vslidedown.vi v24, v10, 14
+; RV64-NEXT:    vslidedown.vi v10, v10, 15
+; RV64-NEXT:    vmv.x.s s2, v12
+; RV64-NEXT:    vmv.x.s s3, v13
+; RV64-NEXT:    vmv.x.s s4, v14
+; RV64-NEXT:    vmv.x.s s5, v15
+; RV64-NEXT:    vmv.x.s s6, v16
+; RV64-NEXT:    vmv.x.s s7, v17
+; RV64-NEXT:    vmv.x.s s8, v18
+; RV64-NEXT:    vmv.x.s s9, v19
+; RV64-NEXT:    vmv.x.s s10, v20
+; RV64-NEXT:    vmv.x.s s11, v21
+; RV64-NEXT:    vmv.x.s ra, v22
+; RV64-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; RV64-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    vmseq.vx v12, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v23
+; RV64-NEXT:    vmseq.vx v13, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s3
+; RV64-NEXT:    vmv.x.s s3, v24
+; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v10
+; RV64-NEXT:    vmseq.vx v10, v8, s5
+; RV64-NEXT:    vmor.mm v12, v12, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s6
+; RV64-NEXT:    vmor.mm v11, v12, v11
+; RV64-NEXT:    vmseq.vx v12, v8, s7
+; RV64-NEXT:    vmor.mm v11, v11, v14
+; RV64-NEXT:    vmseq.vx v14, v8, s8
+; RV64-NEXT:    vmor.mm v10, v11, v10
+; RV64-NEXT:    vmseq.vx v11, v8, s9
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s10
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s11
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, ra
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s2
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s3
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, s4
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a1
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, a2
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a3
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, a4
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a5
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, a6
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a7
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, t0
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t1
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, t2
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, t3
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmseq.vx v14, v8, t4
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t5
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmseq.vx v13, v8, t6
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s0
+; RV64-NEXT:    vmor.mm v10, v10, v14
+; RV64-NEXT:    vmor.mm v10, v10, v11
+; RV64-NEXT:    vmor.mm v10, v10, v13
+; RV64-NEXT:    vmor.mm v10, v10, v12
+; RV64-NEXT:    vmseq.vx v11, v8, s1
+; RV64-NEXT:    vmor.mm v8, v10, v11
+; RV64-NEXT:    vmand.mm v0, v8, v0
+; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    .cfi_restore s3
+; RV64-NEXT:    .cfi_restore s4
+; RV64-NEXT:    .cfi_restore s5
+; RV64-NEXT:    .cfi_restore s6
+; RV64-NEXT:    .cfi_restore s7
+; RV64-NEXT:    .cfi_restore s8
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    .cfi_restore s10
+; RV64-NEXT:    .cfi_restore s11
+; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+  %r = tail call <vscale x 16 x i1> @llvm.experimental.vector.match(<vscale x 16 x i8> %op1, <32 x i8> %op2, <vscale x 16 x i1> %mask)
+  ret <vscale x 16 x i1> %r
+}
+
+define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) {
+; RV32-LABEL: match_v16i8_v32i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -64
+; RV32-NEXT:    .cfi_def_cfa_offset 64
+; RV32-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    .cfi_offset s5, -28
+; RV32-NEXT:    .cfi_offset s6, -32
+; RV32-NEXT:    .cfi_offset s7, -36
+; RV32-NEXT:    .cfi_offset s8, -40
+; RV32-NEXT:    .cfi_offset s9, -44
+; RV32-NEXT:    .cfi_offset s10, -48
+; RV32-NEXT:    .cfi_offset s11, -52
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vslidedown.vi v9, v10, 1
+; RV32-NEXT:    vslidedown.vi v12, v10, 2
+; RV32-NEXT:    vslidedown.vi v13, v10, 3
+; RV32-NEXT:    vslidedown.vi v14, v10, 4
+; RV32-NEXT:    vslidedown.vi v15, v10, 5
+; RV32-NEXT:    vslidedown.vi v16, v10, 6
+; RV32-NEXT:    vslidedown.vi v17, v10, 7
+; RV32-NEXT:    vslidedown.vi v18, v10, 8
+; RV32-NEXT:    vslidedown.vi v19, v10, 9
+; RV32-NEXT:    vslidedown.vi v20, v10, 10
+; RV32-NEXT:    vslidedown.vi v21, v10, 11
+; RV32-NEXT:    vslidedown.vi v22, v10, 12
+; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v10, 16
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 17
+; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 18
+; RV32-NEXT:    vmv.x.s a3, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 19
+; RV32-NEXT:    vmv.x.s a4, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 20
+; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 21
+; RV32-NEXT:    vmv.x.s a6, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 22
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 23
+; RV32-NEXT:    vmv.x.s t0, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 24
+; RV32-NEXT:    vmv.x.s t1, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 25
+; RV32-NEXT:    vmv.x.s t2, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 26
+; RV32-NEXT:    vmv.x.s t3, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 27
+; RV32-NEXT:    vmv.x.s t4, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 28
+; RV32-NEXT:    vmv.x.s t5, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 29
+; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 30
+; RV32-NEXT:    vmv.x.s s0, v24
+; RV32-NEXT:    vslidedown.vi v24, v10, 31
+; RV32-NEXT:    vmv.x.s s1, v24
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v11, v10, 13
+; RV32-NEXT:    vslidedown.vi v23, v10, 14
+; RV32-NEXT:    vslidedown.vi v10, v10, 15
+; RV32-NEXT:    vmv.x.s s2, v9
+; RV32-NEXT:    vmv.x.s s3, v12
+; RV32-NEXT:    vmv.x.s s4, v13
+; RV32-NEXT:    vmv.x.s s5, v14
+; RV32-NEXT:    vmv.x.s s6, v15
+; RV32-NEXT:    vmv.x.s s7, v16
+; RV32-NEXT:    vmv.x.s s8, v17
+; RV32-NEXT:    vmv.x.s s9, v18
+; RV32-NEXT:    vmv.x.s s10, v19
+; RV32-NEXT:    vmv.x.s s11, v20
+; RV32-NEXT:    vmv.x.s ra, v21
+; RV32-NEXT:    vmseq.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v22
+; RV32-NEXT:    vmseq.vx v12, v8, s2
+; RV32-NEXT:    vmv.x.s s2, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s3
+; RV32-NEXT:    vmv.x.s s3, v23
+; RV32-NEXT:    vmseq.vx v13, v8, s4
+; RV32-NEXT:    vmv.x.s s4, v10
+; RV32-NEXT:    vmseq.vx v10, v8, s5
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s6
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s7
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s8
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, s9
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s10
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s11
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, ra
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, a0
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, s2
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s3
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, s4
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, a1
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a2
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a3
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, a4
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, a5
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, a6
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, a7
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, t0
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, t1
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, t2
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, t3
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmseq.vx v13, v8, t4
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmseq.vx v10, v8, t5
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmseq.vx v12, v8, t6
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v11, v8, s0
+; RV32-NEXT:    vmor.mm v9, v9, v13
+; RV32-NEXT:    vmor.mm v9, v9, v10
+; RV32-NEXT:    vmor.mm v9, v9, v12
+; RV32-NEXT:    vmor.mm v9, v9, v11
+; RV32-NEXT:    vmseq.vx v8, v8, s1
+; RV32-NEXT:    vmor.mm v8, v9, v8
+; RV32-NEXT:    vmand.mm v0, v8, v0
+; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    .cfi_restore s3
+; RV32-NEXT:    .cfi_restore s4
+; RV32-NEXT:    .cfi_restore s5
+; RV32-NEXT:    .cfi_restore s6
+; RV32-NEXT:    .cfi_restore s7
+; RV32-NEXT:    .cfi_restore s8
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    .cfi_restore s10
+; RV32-NEXT:    .cfi_restore s11
+; RV32-NEXT:    addi sp, sp, 64
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: match_v16i8_v32i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -112
+; RV64-NEXT:    .cfi_def_cfa_offset 112
+; RV64-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    .cfi_offset s5, -56
+; RV64-NEXT:    .cfi_offset s6, -64
+; RV64-NEXT:    .cfi_offset s7, -72
+; RV64-NEXT:    .cfi_offset s8, -80
+; RV64-NEXT:    .cfi_offset s9, -88
+; RV64-NEXT:    .cfi_offset s10, -96
+; RV64-NEXT:    .cfi_offset s11, -104
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vslidedown.vi v9, v10, 1
+; RV64-NEXT:    vslidedown.vi v12, v10, 2
+; RV64-NEXT:    vslidedown.vi v13, v10, 3
+; RV64-NEXT:    vslidedown.vi v14, v10, 4
+; RV64-NEXT:    vslidedown.vi v15, v10, 5
+; RV64-NEXT:    vslidedown.vi v16, v10, 6
+; RV64-NEXT:    vslidedown.vi v17, v10, 7
+; RV64-NEXT:    vslidedown.vi v18, v10, 8
+; RV64-NEXT:    vslidedown.vi v19, v10, 9
+; RV64-NEXT:    vslidedown.vi v20, v10, 10
+; RV64-NEXT:    vslidedown.vi v21, v10, 11
+; RV64-NEXT:    vslidedown.vi v22, v10, 12
+; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v10, 16
+; RV64-NEXT:    vmv.x.s a1, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 17
+; RV64-NEXT:    vmv.x.s a2, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 18
+; RV64-NEXT:    vmv.x.s a3, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 19
+; RV64-NEXT:    vmv.x.s a4, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 20
+; RV64-NEXT:    vmv.x.s a5, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 21
+; RV64-NEXT:    vmv.x.s a6, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 22
+; RV64-NEXT:    vmv.x.s a7, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 23
+; RV64-NEXT:    vmv.x.s t0, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 24
+; RV64-NEXT:    vmv.x.s t1, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 25
+; RV64-NEXT:    vmv.x.s t2, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 26
+; RV64-NEXT:    vmv.x.s t3, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 27
+; RV64-NEXT:    vmv.x.s t4, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 28
+; RV64-NEXT:    vmv.x.s t5, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 29
+; RV64-NEXT:    vmv.x.s t6, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 30
+; RV64-NEXT:    vmv.x.s s0, v24
+; RV64-NEXT:    vslidedown.vi v24, v10, 31
+; RV64-NEXT:    vmv.x.s s1, v24
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v11, v10, 13
+; RV64-NEXT:    vslidedown.vi v23, v10, 14
+; RV64-NEXT:    vslidedown.vi v10, v10, 15
+; RV64-NEXT:    vmv.x.s s2, v9
+; RV64-NEXT:    vmv.x.s s3, v12
+; RV64-NEXT:    vmv.x.s s4, v13
+; RV64-NEXT:    vmv.x.s s5, v14
+; RV64-NEXT:    vmv.x.s s6, v15
+; RV64-NEXT:    vmv.x.s s7, v16
+; RV64-NEXT:    vmv.x.s s8, v17
+; RV64-NEXT:    vmv.x.s s9, v18
+; RV64-NEXT:    vmv.x.s s10, v19
+; RV64-NEXT:    vmv.x.s s11, v20
+; RV64-NEXT:    vmv.x.s ra, v21
+; RV64-NEXT:    vmseq.vx v9, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v22
+; RV64-NEXT:    vmseq.vx v12, v8, s2
+; RV64-NEXT:    vmv.x.s s2, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s3
+; RV64-NEXT:    vmv.x.s s3, v23
+; RV64-NEXT:    vmseq.vx v13, v8, s4
+; RV64-NEXT:    vmv.x.s s4, v10
+; RV64-NEXT:    vmseq.vx v10, v8, s5
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s6
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s7
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s8
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, s9
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s10
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s11
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, ra
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, a0
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, s2
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s3
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, s4
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, a1
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a2
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a3
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, a4
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, a5
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, a6
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, a7
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, t0
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, t1
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, t2
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, t3
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmseq.vx v13, v8, t4
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmseq.vx v10, v8, t5
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmseq.vx v12, v8, t6
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v11, v8, s0
+; RV64-NEXT:    vmor.mm v9, v9, v13
+; RV64-NEXT:    vmor.mm v9, v9, v10
+; RV64-NEXT:    vmor.mm v9, v9, v12
+; RV64-NEXT:    vmor.mm v9, v9, v11
+; RV64-NEXT:    vmseq.vx v8, v8, s1
+; RV64-NEXT:    vmor.mm v8, v9, v8
+; RV64-NEXT:    vmand.mm v0, v8, v0
+; RV64-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    .cfi_restore s3
+; RV64-NEXT:    .cfi_restore s4
+; RV64-NEXT:    .cfi_restore s5
+; RV64-NEXT:    .cfi_restore s6
+; RV64-NEXT:    .cfi_restore s7
+; RV64-NEXT:    .cfi_restore s8
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    .cfi_restore s10
+; RV64-NEXT:    .cfi_restore s11
+; RV64-NEXT:    addi sp, sp, 112
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+  %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask)
+  ret <16 x i1> %r
+}
+
+define <vscale x 4 x i1> @match_nxv4xi32_v4i32(<vscale x 4 x i32> %op1, <4 x i32> %op2, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: match_nxv4xi32_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vslidedown.vi v11, v10, 1
+; CHECK-NEXT:    vslidedown.vi v12, v10, 2
+; CHECK-NEXT:    vslidedown.vi v10, v10, 3
+; CHECK-NEXT:    vmv.x.s a1, v11
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmseq.vx v11, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v12
+; CHECK-NEXT:    vmseq.vx v12, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v10
+; CHECK-NEXT:    vmseq.vx v10, v8, a0
+; CHECK-NEXT:    vmor.mm v11, v11, v12
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vmseq.vx v11, v8, a1
+; CHECK-NEXT:    vmor.mm v8, v10, v11
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 4 x i1> @llvm.experimental.vector.match(<vscale x 4 x i32> %op1, <4 x i32> %op2, <vscale x 4 x i1> %mask)
+  ret <vscale x 4 x i1> %r
+}
+
+define <vscale x 2 x i1> @match_nxv2xi64_v2i64(<vscale x 2 x i64> %op1, <2 x i64> %op2, <vscale x 2 x i1> %mask) {
+; RV32-LABEL: match_nxv2xi64_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vslidedown.vi v11, v10, 1
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    vsrl.vx v10, v10, a1
+; RV32-NEXT:    vmv.x.s a3, v11
+; RV32-NEXT:    vsrl.vx v11, v11, a1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    vmv.x.s a0, v11
+; RV32-NEXT:    sw a3, 0(sp)
+; RV32-NEXT:    sw a0, 4(sp)
+; RV32-NEXT:    mv a0, sp
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a2), zero
+; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    vmseq.vv v14, v8, v10
+; RV32-NEXT:    vmseq.vv v10, v8, v12
+; RV32-NEXT:    vmor.mm v8, v14, v10
+; RV32-NEXT:    vmand.mm v0, v8, v0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: match_nxv2xi64_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vslidedown.vi v10, v10, 1
+; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmseq.vx v11, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vmseq.vx v10, v8, a0
+; RV64-NEXT:    vmor.mm v8, v11, v10
+; RV64-NEXT:    vmand.mm v0, v8, v0
+; RV64-NEXT:    ret
+  %r = tail call <vscale x 2 x i1> @llvm.experimental.vector.match(<vscale x 2 x i64> %op1, <2 x i64> %op2, <vscale x 2 x i1> %mask)
+  ret <vscale x 2 x i1> %r
+}
+
+define <4 x i1> @match_v4xi32_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
+; CHECK-LABEL: match_v4xi32_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v10, v8, v10
+; CHECK-NEXT:    vmseq.vv v11, v8, v11
+; CHECK-NEXT:    vmor.mm v10, v11, v10
+; CHECK-NEXT:    vrgather.vi v11, v9, 2
+; CHECK-NEXT:    vrgather.vi v12, v9, 3
+; CHECK-NEXT:    vmseq.vv v9, v8, v11
+; CHECK-NEXT:    vmor.mm v9, v10, v9
+; CHECK-NEXT:    vmseq.vv v8, v8, v12
+; CHECK-NEXT:    vmor.mm v8, v9, v8
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <4 x i1> @llvm.experimental.vector.match(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask)
+  ret <4 x i1> %r
+}
+
+define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
+; CHECK-LABEL: match_v2xi64_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vrgather.vi v11, v9, 0
+; CHECK-NEXT:    vmseq.vv v9, v8, v10
+; CHECK-NEXT:    vmseq.vv v8, v8, v11
+; CHECK-NEXT:    vmor.mm v8, v8, v9
+; CHECK-NEXT:    vmand.mm v0, v8, v0
+; CHECK-NEXT:    ret
+  %r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask)
+  ret <2 x i1> %r
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
new file mode 100644
index 0000000000000..1eef183db21bb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64
+
+define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
+; CHECK-LABEL: extract_last_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    beqz a1, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    ret
+  %notzero = icmp ne <16 x i8> %mask, zeroinitializer
+  %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %notzero, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
+; CHECK-LABEL: extract_last_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    beqz a1, .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    ret
+  %notzero = icmp ne <8 x i16> %mask, zeroinitializer
+  %res = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %notzero, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
+; CHECK-LABEL: extract_last_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    beqz a1, .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    ret
+  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+  %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
+; RV32-LABEL: extract_last_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v0, v9, 0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vcpop.m a2, v0
+; RV32-NEXT:    vid.v v9, v0.t
+; RV32-NEXT:    beqz a2, .LBB3_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vredmaxu.vs v9, v9, v9
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    andi a0, a0, 255
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:  .LBB3_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: extract_last_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v0, v9, 0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
+; RV64-NEXT:    vmv.v.i v9, 0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vid.v v9, v0.t
+; RV64-NEXT:    beqz a1, .LBB3_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vredmaxu.vs v9, v9, v9
+; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    andi a0, a0, 255
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64-NEXT:    vslidedown.vx v8, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:  .LBB3_2:
+; RV64-NEXT:    ret
+  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+  %res = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %notzero, i64 %passthru)
+  ret i64 %res
+}
+
+define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %passthru) {
+; CHECK-LABEL: extract_last_float:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    beqz a0, .LBB4_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB4_2:
+; CHECK-NEXT:    ret
+  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
+  %res = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %notzero, float %passthru)
+  ret float %res
+}
+
+define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %passthru) {
+; CHECK-LABEL: extract_last_double:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    beqz a0, .LBB5_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB5_2:
+; CHECK-NEXT:    ret
+  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
+  %res = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> %data, <2 x i1> %notzero, double %passthru)
+  ret double %res
+}
+
+define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a1, .LBB6_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:  .LBB6_2:
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
+  ret i8 %res
+}
+
+define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
+; RV32-LABEL: extract_last_i16_scalable:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vid.v v10, v0.t
+; RV32-NEXT:    beqz a1, .LBB7_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vredmaxu.vs v10, v10, v10
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    srli a0, a0, 16
+; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:  .LBB7_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: extract_last_i16_scalable:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vid.v v10, v0.t
+; RV64-NEXT:    beqz a1, .LBB7_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vredmaxu.vs v10, v10, v10
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    srli a0, a0, 48
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vslidedown.vx v8, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:  .LBB7_2:
+; RV64-NEXT:    ret
+  %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
+  ret i16 %res
+}
+
+define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) {
+; RV32-LABEL: extract_last_i32_scalable:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, mu
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vid.v v10, v0.t
+; RV32-NEXT:    beqz a1, .LBB8_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vredmaxu.vs v10, v10, v10
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:  .LBB8_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: extract_last_i32_scalable:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, mu
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vid.v v10, v0.t
+; RV64-NEXT:    beqz a1, .LBB8_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vredmaxu.vs v10, v10, v10
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vx v8, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:  .LBB8_2:
+; RV64-NEXT:    ret
+  %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
+  ret i32 %res
+}
+
+define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
+; RV32-LABEL: extract_last_i64_scalable:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a2, zero, e64, m2, ta, mu
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vcpop.m a2, v0
+; RV32-NEXT:    vid.v v10, v0.t
+; RV32-NEXT:    beqz a2, .LBB9_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vredmaxu.vs v10, v10, v10
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:  .LBB9_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: extract_last_i64_scalable:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, mu
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vid.v v10, v0.t
+; RV64-NEXT:    beqz a1, .LBB9_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vredmaxu.vs v10, v10, v10
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vx v8, v8, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:  .LBB9_2:
+; RV64-NEXT:    ret
+  %res = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
+  ret i64 %res
+}
+
+define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) {
+; RV32-LABEL: extract_last_float_scalable:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vcpop.m a0, v0
+; RV32-NEXT:    vid.v v10, v0.t
+; RV32-NEXT:    beqz a0, .LBB10_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vredmaxu.vs v10, v10, v10
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB10_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: extract_last_float_scalable:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vcpop.m a0, v0
+; RV64-NEXT:    vid.v v10, v0.t
+; RV64-NEXT:    beqz a0, .LBB10_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vredmaxu.vs v10, v10, v10
+; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vx v8, v8, a0
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB10_2:
+; RV64-NEXT:    ret
+  %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
+  ret float %res
+}
+
+define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
+; CHECK-LABEL: extract_last_double_scalable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vcpop.m a0, v0
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    beqz a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB11_2:
+; CHECK-NEXT:    ret
+  %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
+  ret double %res
+}
+
+declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double)
+declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
+declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
+declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 8dfab72d008c2..bd7ea6c19d0b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -65,6 +65,18 @@ define <vscale x 3 x i8> @vpload_nxv3i8(ptr %ptr, <vscale x 3 x i1> %m, i32 zero
   ret <vscale x 3 x i8> %load
 }
 
+declare <vscale x 4 x i6> @llvm.vp.load.nxv4i6.p0(ptr, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i6> @vpload_nxv4i6(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpload_nxv4i6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %load = call <vscale x 4 x i6> @llvm.vp.load.nxv4i6.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i6> %load
+}
+
 declare <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr, <vscale x 4 x i1>, i32)
 
 define <vscale x 4 x i8> @vpload_nxv4i8(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
@@ -523,10 +535,10 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
-; CHECK-NEXT:    bltu a1, a2, .LBB43_2
+; CHECK-NEXT:    bltu a1, a2, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
-; CHECK-NEXT:  .LBB43_2:
+; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
@@ -553,10 +565,10 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a5, a3, 1
 ; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    bltu a2, a5, .LBB44_2
+; CHECK-NEXT:    bltu a2, a5, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a5
-; CHECK-NEXT:  .LBB44_2:
+; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    sub a6, a4, a3
 ; CHECK-NEXT:    slli a7, a3, 3
 ; CHECK-NEXT:    srli t0, a3, 3
@@ -572,10 +584,10 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    sltu a2, a2, a5
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
-; CHECK-NEXT:    bltu a2, a3, .LBB44_4
+; CHECK-NEXT:    bltu a2, a3, .LBB45_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
-; CHECK-NEXT:  .LBB44_4:
+; CHECK-NEXT:  .LBB45_4:
 ; CHECK-NEXT:    slli a5, a3, 4
 ; CHECK-NEXT:    srli a6, a3, 2
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
@@ -583,10 +595,10 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a5), v0.t
-; CHECK-NEXT:    bltu a4, a3, .LBB44_6
+; CHECK-NEXT:    bltu a4, a3, .LBB45_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:  .LBB44_6:
+; CHECK-NEXT:  .LBB45_6:
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 549f57a01f38f..8978dc268d4e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -104,6 +104,18 @@ define void @vpstore_nxv4i16(<vscale x 4 x i16> %val, ptr %ptr, <vscale x 4 x i1
   ret void
 }
 
+declare void @llvm.vp.store.nxv8i12.p0(<vscale x 8 x i12>, ptr, <vscale x 8 x i1>, i32)
+
+define void @vpstore_nxv8i12(<vscale x 8 x i12> %val, ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpstore_nxv8i12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  call void @llvm.vp.store.nxv8i12.p0(<vscale x 8 x i12> %val, ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret void
+}
+
 declare void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16>, ptr, <vscale x 8 x i1>, i32)
 
 define void @vpstore_nxv8i16(<vscale x 8 x i16> %val, ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
@@ -421,10 +433,10 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    bltu a1, a2, .LBB34_2
+; CHECK-NEXT:    bltu a1, a2, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:  .LBB34_2:
+; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
 ; CHECK-NEXT:    srli a3, a2, 3
@@ -462,15 +474,15 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 1
 ; CHECK-NEXT:    mv a5, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB35_2
+; CHECK-NEXT:    bltu a2, a4, .LBB36_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:  .LBB35_2:
+; CHECK-NEXT:  .LBB36_2:
 ; CHECK-NEXT:    mv a6, a5
-; CHECK-NEXT:    bltu a5, a3, .LBB35_4
+; CHECK-NEXT:    bltu a5, a3, .LBB36_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a6, a3
-; CHECK-NEXT:  .LBB35_4:
+; CHECK-NEXT:  .LBB36_4:
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
@@ -492,10 +504,10 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a6), v0.t
-; CHECK-NEXT:    bltu a0, a3, .LBB35_6
+; CHECK-NEXT:    bltu a0, a3, .LBB36_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
-; CHECK-NEXT:  .LBB35_6:
+; CHECK-NEXT:  .LBB36_6:
 ; CHECK-NEXT:    slli a2, a3, 4
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
index 18bd41a210f53..81a8a8065e6b6 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s \
 ; RUN:   -target-abi=ilp32 -mattr=+zhinx | FileCheck %s
 
-;; These tests cover the use of `r` and `cr` constraints for floating point values on rv32.
+;; These tests cover the use of `r`, `R`, and `cr` constraints for floating point values on rv32.
 ;;
 ;; In particular, there is significant complexity around using paired GPRs for double values on rv32.
 
@@ -26,6 +26,62 @@ entry:
   ret void
 }
 
+define dso_local void @zdinx_asm_R(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind {
+; CHECK-LABEL: zdinx_asm_R:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    mv a6, a1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    fsgnjx.d a2, a6, a4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    sw a2, 8(a0)
+; CHECK-NEXT:    sw a3, 12(a0)
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx = getelementptr inbounds double, ptr %a, i32 1
+  %0 = tail call double asm "fsgnjx.d $0, $1, $2", "=R,R,R"(double %b, double %c)
+  store double %0, ptr %arrayidx, align 8
+  ret void
+}
+
+define dso_local void @zdinx_asm_inout(ptr nocapture noundef writeonly %a, double noundef %b) nounwind {
+; CHECK-LABEL: zdinx_asm_inout:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    fmv.d a2, a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    sw a2, 8(a0)
+; CHECK-NEXT:    sw a3, 12(a0)
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx = getelementptr inbounds double, ptr %a, i32 1
+  %0 = tail call double asm "fsgnj.d $0, $1, $1", "=r,0"(double %b)
+  store double %0, ptr %arrayidx, align 8
+  ret void
+}
+
+define dso_local void @zdinx_asm_Pr_inout(ptr nocapture noundef writeonly %a, double noundef %b) nounwind {
+; CHECK-LABEL: zdinx_asm_Pr_inout:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    fabs.d a2, a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    sw a2, 8(a0)
+; CHECK-NEXT:    sw a3, 12(a0)
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx = getelementptr inbounds double, ptr %a, i32 1
+  %0 = tail call double asm "fsgnjx.d $0, $1, $1", "=R,0"(double %b)
+  store double %0, ptr %arrayidx, align 8
+  ret void
+}
+
 define dso_local void @zfinx_asm(ptr nocapture noundef writeonly %a, float noundef %b, float noundef %c) nounwind {
 ; CHECK-LABEL: zfinx_asm:
 ; CHECK:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
index 8016b940b8d51..377440e1bbc93 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
@@ -896,17 +896,17 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v1_pred(<4 x float> %src1, <4 x float
 ; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
-; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmov.f32 s4, s3
-; CHECK-MVE-NEXT:    vmov.f32 s8, s2
-; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmla.f32 s4, s7, s11
-; CHECK-MVE-NEXT:    vmla.f32 s8, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s2
+; CHECK-MVE-NEXT:    vmov.f32 s12, s3
+; CHECK-MVE-NEXT:    vmla.f32 s14, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s10, s1
+; CHECK-MVE-NEXT:    vmla.f32 s12, s7, s11
+; CHECK-MVE-NEXT:    vmla.f32 s10, s5, s9
+; CHECK-MVE-NEXT:    vmov.f32 s9, s0
 ; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
+; CHECK-MVE-NEXT:    vmla.f32 s9, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
@@ -914,13 +914,13 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v1_pred(<4 x float> %src1, <4 x float
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    cset r3, mi
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s12
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s14
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s9
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -949,17 +949,17 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v2_pred(<4 x float> %src1, <4 x float
 ; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
-; CHECK-MVE-NEXT:    vmla.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmov.f32 s4, s3
-; CHECK-MVE-NEXT:    vmov.f32 s8, s2
-; CHECK-MVE-NEXT:    vmla.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmla.f32 s4, s7, s11
-; CHECK-MVE-NEXT:    vmla.f32 s8, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s2
+; CHECK-MVE-NEXT:    vmov.f32 s12, s3
+; CHECK-MVE-NEXT:    vmla.f32 s14, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s10, s1
+; CHECK-MVE-NEXT:    vmla.f32 s12, s7, s11
+; CHECK-MVE-NEXT:    vmla.f32 s10, s5, s9
+; CHECK-MVE-NEXT:    vmov.f32 s9, s0
 ; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
+; CHECK-MVE-NEXT:    vmla.f32 s9, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
@@ -967,13 +967,13 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v2_pred(<4 x float> %src1, <4 x float
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    cset r3, mi
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s12
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s14
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s9
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -1002,17 +1002,17 @@ define arm_aapcs_vfpcc <4 x float> @vfms32_pred(<4 x float> %src1, <4 x float> %
 ; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s1
-; CHECK-MVE-NEXT:    vmls.f32 s14, s4, s8
-; CHECK-MVE-NEXT:    vmov.f32 s4, s3
-; CHECK-MVE-NEXT:    vmov.f32 s8, s2
-; CHECK-MVE-NEXT:    vmls.f32 s12, s5, s9
-; CHECK-MVE-NEXT:    vmls.f32 s4, s7, s11
-; CHECK-MVE-NEXT:    vmls.f32 s8, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s14, s2
+; CHECK-MVE-NEXT:    vmov.f32 s12, s3
+; CHECK-MVE-NEXT:    vmls.f32 s14, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s10, s1
+; CHECK-MVE-NEXT:    vmls.f32 s12, s7, s11
+; CHECK-MVE-NEXT:    vmls.f32 s10, s5, s9
+; CHECK-MVE-NEXT:    vmov.f32 s9, s0
 ; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
+; CHECK-MVE-NEXT:    vmls.f32 s9, s4, s8
 ; CHECK-MVE-NEXT:    cset r1, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s6, #0
@@ -1020,13 +1020,13 @@ define arm_aapcs_vfpcc <4 x float> @vfms32_pred(<4 x float> %src1, <4 x float> %
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    cset r3, mi
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s12
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s14
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s14
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s9
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <4 x float> %src2, %src3
@@ -1058,14 +1058,14 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s2
-; CHECK-MVE-NEXT:    vmov.f32 s10, s1
-; CHECK-MVE-NEXT:    vmla.f32 s12, s4, s8
-; CHECK-MVE-NEXT:    vmov.f32 s4, s3
-; CHECK-MVE-NEXT:    vmla.f32 s14, s6, s8
-; CHECK-MVE-NEXT:    vmla.f32 s10, s5, s8
-; CHECK-MVE-NEXT:    vmla.f32 s4, s7, s8
+; CHECK-MVE-NEXT:    vmov.f32 s10, s3
+; CHECK-MVE-NEXT:    vmov.f32 s12, s2
+; CHECK-MVE-NEXT:    vmov.f32 s14, s1
+; CHECK-MVE-NEXT:    vmov.f32 s9, s0
+; CHECK-MVE-NEXT:    vmla.f32 s10, s7, s8
+; CHECK-MVE-NEXT:    vmla.f32 s12, s6, s8
+; CHECK-MVE-NEXT:    vmla.f32 s14, s5, s8
+; CHECK-MVE-NEXT:    vmla.f32 s9, s4, s8
 ; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
@@ -1076,13 +1076,13 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    cset r3, mi
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s10
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s14
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s12
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s9
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %i = insertelement <4 x float> undef, float %src3, i32 0
@@ -1115,13 +1115,13 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    vcmp.f32 s5, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s8
 ; CHECK-MVE-NEXT:    vmov.f32 s10, s8
-; CHECK-MVE-NEXT:    vmla.f32 s12, s0, s4
-; CHECK-MVE-NEXT:    vmov.f32 s4, s8
-; CHECK-MVE-NEXT:    vmla.f32 s8, s2, s6
-; CHECK-MVE-NEXT:    vmla.f32 s10, s1, s5
-; CHECK-MVE-NEXT:    vmla.f32 s4, s3, s7
+; CHECK-MVE-NEXT:    vmov.f32 s12, s8
+; CHECK-MVE-NEXT:    vmov.f32 s14, s8
+; CHECK-MVE-NEXT:    vmla.f32 s8, s0, s4
+; CHECK-MVE-NEXT:    vmla.f32 s10, s3, s7
+; CHECK-MVE-NEXT:    vmla.f32 s12, s2, s6
+; CHECK-MVE-NEXT:    vmla.f32 s14, s1, s5
 ; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, #0
@@ -1132,13 +1132,13 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float>
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    cset r3, mi
 ; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s3, s10
 ; CHECK-MVE-NEXT:    cmp r3, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s8
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s2, s12
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s10
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s1, s14
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s12
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s0, s8
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %i = insertelement <4 x float> undef, float %src3, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 81b6a6940a7d6..75b6cb3e1272b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -42,54 +42,36 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f32_v2i32(<2 x float> %f) {
 ; CHECK-NEXT:    vldr s20, .LCPI1_1
 ; CHECK-NEXT:    vcmp.f32 s17, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r4, #-1
 ; CHECK-NEXT:    movlt.w r5, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    mvngt r5, #-2147483648
+; CHECK-NEXT:    movgt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r4, #0
 ; CHECK-NEXT:    movvs r5, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcmp.f32 s16, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt.w r0, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r0, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt.w r1, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt r1, #0
+; CHECK-NEXT:    mvngt r0, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -1275,54 +1257,36 @@ define arm_aapcs_vfpcc <2 x i32> @test_signed_v2f16_v2i32(<2 x half> %f) {
 ; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r4, #-1
 ; CHECK-NEXT:    movlt.w r5, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    mvngt r5, #-2147483648
+; CHECK-NEXT:    movgt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r4, #0
 ; CHECK-NEXT:    movvs r5, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt.w r0, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r0, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt.w r1, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt r1, #0
+; CHECK-NEXT:    mvngt r0, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -1568,9 +1532,9 @@ define arm_aapcs_vfpcc <4 x i8> @test_signed_v4f32_v4i8(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s12, s2, s4
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s0, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s12, s12, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s1, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s3, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s3, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s1, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s12, s12
 ; CHECK-MVE-NEXT:    vminnm.f32 s8, s8, s6
 ; CHECK-MVE-NEXT:    vminnm.f32 s4, s4, s6
@@ -1588,10 +1552,10 @@ define arm_aapcs_vfpcc <4 x i8> @test_signed_v4f32_v4i8(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s4
+; CHECK-MVE-NEXT:    vmov r2, s8
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
 ; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r3, s8
+; CHECK-MVE-NEXT:    vmov r3, s4
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
@@ -1627,9 +1591,9 @@ define arm_aapcs_vfpcc <4 x i13> @test_signed_v4f32_v4i13(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s12, s2, s4
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s0, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s12, s12, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s1, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s3, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s3, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s1, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s12, s12
 ; CHECK-MVE-NEXT:    vminnm.f32 s8, s8, s6
 ; CHECK-MVE-NEXT:    vminnm.f32 s4, s4, s6
@@ -1647,10 +1611,10 @@ define arm_aapcs_vfpcc <4 x i13> @test_signed_v4f32_v4i13(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s4
+; CHECK-MVE-NEXT:    vmov r2, s8
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
 ; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r3, s8
+; CHECK-MVE-NEXT:    vmov r3, s4
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
@@ -1686,9 +1650,9 @@ define arm_aapcs_vfpcc <4 x i16> @test_signed_v4f32_v4i16(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s12, s2, s4
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s0, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s12, s12, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s1, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s3, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s3, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s1, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s12, s12
 ; CHECK-MVE-NEXT:    vminnm.f32 s8, s8, s6
 ; CHECK-MVE-NEXT:    vminnm.f32 s4, s4, s6
@@ -1706,10 +1670,10 @@ define arm_aapcs_vfpcc <4 x i16> @test_signed_v4f32_v4i16(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s4
+; CHECK-MVE-NEXT:    vmov r2, s8
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
 ; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r3, s8
+; CHECK-MVE-NEXT:    vmov r3, s4
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
@@ -1743,9 +1707,9 @@ define arm_aapcs_vfpcc <4 x i19> @test_signed_v4f32_v4i19(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s12, s2, s4
 ; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s0, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s12, s12, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s1, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s3, s4
 ; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s3, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s4, s1, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s12, s12
 ; CHECK-MVE-NEXT:    vminnm.f32 s8, s8, s6
 ; CHECK-MVE-NEXT:    vminnm.f32 s4, s4, s6
@@ -1763,10 +1727,10 @@ define arm_aapcs_vfpcc <4 x i19> @test_signed_v4f32_v4i19(<4 x float> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s4
+; CHECK-MVE-NEXT:    vmov r2, s8
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
 ; CHECK-MVE-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-MVE-NEXT:    vmov r3, s8
+; CHECK-MVE-NEXT:    vmov r3, s4
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
@@ -1821,146 +1785,122 @@ define arm_aapcs_vfpcc <4 x i32> @test_signed_v4f32_v4i32_duplicate(<4 x float>
 define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i50:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vldr s20, .LCPI28_0
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    vmov r0, s17
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    movtlt r5, #65534
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vldr s22, .LCPI28_1
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vldr s22, .LCPI28_0
+; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    vldr s20, .LCPI28_1
+; CHECK-NEXT:    vcmp.f32 s17, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    movtlt r7, #65534
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r5, #65535
-; CHECK-NEXT:    movtgt r5, #1
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    mov r10, r1
+; CHECK-NEXT:    movwlt r11, #0
+; CHECK-NEXT:    movtlt r11, #65534
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
+; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt.w r9, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r7, #65535
-; CHECK-NEXT:    movtgt r7, #1
+; CHECK-NEXT:    vcmp.f32 s17, s17
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r11, #65535
+; CHECK-NEXT:    movtgt r11, #1
+; CHECK-NEXT:    movgt.w r9, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:    movvs.w r11, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r8]
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vcmp.f32 s19, s20
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movtlt r5, #65534
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r5, #65535
+; CHECK-NEXT:    movtgt r5, #1
+; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movtlt r7, #65534
 ; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r7, #65535
+; CHECK-NEXT:    movtgt r7, #1
 ; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    vcmp.f32 s19, s19
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs r4, #0
 ; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r9, #-1
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs.w r9, #0
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    bfc r5, #18, #14
-; CHECK-NEXT:    mov r6, r9
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsll r4, r1, #22
-; CHECK-NEXT:    lsrl r6, r5, #28
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movwlt r10, #0
-; CHECK-NEXT:    movtlt r10, #65534
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    orrs r1, r5
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r10, #65535
-; CHECK-NEXT:    movtgt r10, #1
-; CHECK-NEXT:    str.w r1, [r8, #20]
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    orr.w r2, r6, r4
+; CHECK-NEXT:    vcmp.f32 s16, s22
+; CHECK-NEXT:    bfc r5, #18, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    ittt lt
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movtlt r1, #65534
-; CHECK-NEXT:    vcmp.f32 s17, s22
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    mov r2, r6
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movwgt r1, #65535
 ; CHECK-NEXT:    movtgt r1, #1
-; CHECK-NEXT:    str.w r2, [r8, #16]
-; CHECK-NEXT:    lsrs r2, r7, #10
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    strb.w r2, [r8, #24]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s17, s17
+; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    lsrl r2, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    lsrl r2, r1, #14
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    orr.w r1, r1, r9, lsl #4
+; CHECK-NEXT:    str.w r0, [r8]
+; CHECK-NEXT:    lsrs r0, r7, #10
+; CHECK-NEXT:    bfc r7, #18, #14
+; CHECK-NEXT:    bfc r11, #18, #14
+; CHECK-NEXT:    lsll r4, r7, #22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strd r2, r1, [r8, #8]
+; CHECK-NEXT:    orr.w r3, r5, r7
+; CHECK-NEXT:    str.w r3, [r8, #20]
+; CHECK-NEXT:    orr.w r2, r2, r4
+; CHECK-NEXT:    str.w r2, [r8, #16]
+; CHECK-NEXT:    strb.w r0, [r8, #24]
+; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    lsrl r0, r11, #14
+; CHECK-NEXT:    orr.w r2, r11, r6, lsl #4
+; CHECK-NEXT:    strd r0, r2, [r8, #8]
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r10, #0
-; CHECK-NEXT:    bfc r10, #18, #14
-; CHECK-NEXT:    orr.w r0, r10, r0, lsl #18
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    orr.w r0, r1, r9, lsl #18
 ; CHECK-NEXT:    str.w r0, [r8, #4]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI28_0:
@@ -1974,131 +1914,98 @@ define arm_aapcs_vfpcc <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
 define arm_aapcs_vfpcc <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i64:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r11, r0
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vldr s22, .LCPI29_0
-; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    mov r10, r1
 ; CHECK-NEXT:    vldr s20, .LCPI29_1
-; CHECK-NEXT:    vmov r8, s16
+; CHECK-NEXT:    vmov r9, s17
 ; CHECK-NEXT:    vcmp.f32 s19, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r10, #-2147483648
+; CHECK-NEXT:    movlt.w r11, #0
 ; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    mvngt r10, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s19, s19
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    vmov r8, s16
+; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs.w r10, #0
+; CHECK-NEXT:    movvs.w r11, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r9, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    movlt.w r6, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r0, r4
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    mvngt r6, #-2147483648
+; CHECK-NEXT:    movgt.w r7, #-1
 ; CHECK-NEXT:    vcmp.f32 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r7, #0
 ; CHECK-NEXT:    movvs r6, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    vcmp.f32 s17, s22
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r4, #-2147483648
 ; CHECK-NEXT:    movlt r5, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    mov r0, r8
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    mvngt r4, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r4, #0
 ; CHECK-NEXT:    movvs r5, #0
 ; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    vmov q1[2], q1[0], r7, r10
+; CHECK-NEXT:    vmov q1[2], q1[0], r7, r11
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r4, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r4, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt.w r1, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
+; CHECK-NEXT:    vmov q1[3], q1[1], r6, r10
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    mvngt r1, #-2147483648
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    vmov q1[3], q1[1], r6, r9
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r0, #0
 ; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI29_0:
@@ -2118,568 +2025,370 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r0, s17
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    mov r10, r3
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vldr s22, .LCPI30_0
-; CHECK-NEXT:    vmov r7, s17
+; CHECK-NEXT:    mov r11, r1
 ; CHECK-NEXT:    vldr s20, .LCPI30_1
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    mov r10, r2
+; CHECK-NEXT:    vcmp.f32 s17, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    vcmp.f32 s17, s17
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str.w r2, [r9, #33]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs.w r10, #0
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    str r7, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    movvs.w r11, #0
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    mov r5, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #29]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    mvnlt r5, #7
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt r5, #7
 ; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vcmp.f32 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str.w r2, [r4, #33]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    str.w r1, [r4, #29]
+; CHECK-NEXT:    vmov r1, s19
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9, #25]
-; CHECK-NEXT:    mov r0, r3
+; CHECK-NEXT:    str.w r0, [r4, #25]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    mov r11, r3
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vcmp.f32 s19, s22
+; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    mov r8, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r8, #7
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt.w r9, #0
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt.w r9, #-1
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movgt.w r8, #7
+; CHECK-NEXT:    vcmp.f32 s19, s19
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    itttt vs
+; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str.w r2, [r9, #8]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r9, #4]
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    mvnlt r3, #7
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt r3, #7
 ; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str r2, [r4, #8]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    str r1, [r4, #4]
+; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9]
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r9, #28
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    orr.w r1, r9, r6, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #45]
+; CHECK-NEXT:    and r1, r8, #15
+; CHECK-NEXT:    str.w r0, [r4, #41]
+; CHECK-NEXT:    and r0, r5, #15
+; CHECK-NEXT:    lsrl r6, r1, #28
+; CHECK-NEXT:    strb.w r6, [r4, #49]
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r4, #37]
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    lsrl r0, r11, #28
+; CHECK-NEXT:    orr.w r1, r11, r10, lsl #4
+; CHECK-NEXT:    strd r0, r1, [r4, #16]
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r10, r1, #28
+; CHECK-NEXT:    strb.w r10, [r4, #24]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    and r0, r3, #15
+; CHECK-NEXT:    orr.w r0, r0, r2, lsl #4
+; CHECK-NEXT:    str r0, [r4, #12]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI30_0:
+; CHECK-NEXT:    .long 0xf1000000 @ float -6.338253E+29
+; CHECK-NEXT:  .LCPI30_1:
+; CHECK-NEXT:    .long 0x70ffffff @ float 6.33825262E+29
+    %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f)
+    ret <4 x i100> %x
+}
+
+define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
+; CHECK-LABEL: test_signed_v4f32_v4i128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vldr s22, .LCPI31_0
+; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    vldr s20, .LCPI31_1
+; CHECK-NEXT:    vmov r6, s17
 ; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r5, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    strd r5, r1, [r4, #48]
+; CHECK-NEXT:    strd r2, r3, [r4, #56]
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    add.w r12, r4, #32
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r10, #7
 ; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #7
 ; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r10, #0
-; CHECK-NEXT:    and r0, r10, #15
-; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    orr.w r0, r0, r6, lsl #4
-; CHECK-NEXT:    str.w r0, [r9, #37]
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    mov r8, r3
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s17, s22
+; CHECK-NEXT:    add.w r12, r4, #16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
 ; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s16, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r11, #7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r11, #0
-; CHECK-NEXT:    and r7, r11, #15
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
-; CHECK-NEXT:    str.w r7, [r9, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r6, r5, #28
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    orr.w r7, r5, r4, lsl #4
-; CHECK-NEXT:    str.w r7, [r9, #45]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r6, [r9, #41]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r8, #7
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r8, #7
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r8, #0
-; CHECK-NEXT:    and r5, r8, #15
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    lsrl r4, r5, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    strb.w r4, [r9, #49]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r0, r1, #28
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vcmp.f32 s17, s17
+; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r3, #0
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    strd r0, r1, [r9, #16]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r3, #7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #7
-; CHECK-NEXT:    vcmp.f32 s17, s17
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r9, #24]
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm r4!, {r0, r1, r2, r3}
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI30_0:
-; CHECK-NEXT:    .long 0xf1000000 @ float -6.338253E+29
-; CHECK-NEXT:  .LCPI30_1:
-; CHECK-NEXT:    .long 0x70ffffff @ float 6.33825262E+29
-    %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f)
-    ret <4 x i100> %x
+; CHECK-NEXT:  .LCPI31_0:
+; CHECK-NEXT:    .long 0xff000000 @ float -1.70141183E+38
+; CHECK-NEXT:  .LCPI31_1:
+; CHECK-NEXT:    .long 0x7effffff @ float 1.70141173E+38
+    %x = call <4 x i128> @llvm.fptosi.sat.v4f32.v4i128(<4 x float> %f)
+    ret <4 x i128> %x
 }
 
-define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
-; CHECK-LABEL: test_signed_v4f32_v4i128:
+;
+; 2-Vector double to signed integer -- result size variation
+;
+
+declare <2 x   i1> @llvm.fptosi.sat.v2f64.v2i1  (<2 x double>)
+declare <2 x   i8> @llvm.fptosi.sat.v2f64.v2i8  (<2 x double>)
+declare <2 x  i13> @llvm.fptosi.sat.v2f64.v2i13 (<2 x double>)
+declare <2 x  i16> @llvm.fptosi.sat.v2f64.v2i16 (<2 x double>)
+declare <2 x  i19> @llvm.fptosi.sat.v2f64.v2i19 (<2 x double>)
+declare <2 x  i50> @llvm.fptosi.sat.v2f64.v2i50 (<2 x double>)
+declare <2 x  i64> @llvm.fptosi.sat.v2f64.v2i64 (<2 x double>)
+declare <2 x i100> @llvm.fptosi.sat.v2f64.v2i100(<2 x double>)
+declare <2 x i128> @llvm.fptosi.sat.v2f64.v2i128(<2 x double>)
+
+define arm_aapcs_vfpcc <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) {
+; CHECK-LABEL: test_signed_v2f64_v2i1:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vmov r5, s18
-; CHECK-NEXT:    vldr s22, .LCPI31_0
-; CHECK-NEXT:    vldr s20, .LCPI31_1
-; CHECK-NEXT:    vmov r7, s16
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    str r3, [r4, #60]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    str r2, [r4, #56]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #52]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #48]
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    vmov r6, s17
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r3, [r4, #44]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r2, [r4, #40]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #36]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #32]
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    str r3, [r4, #28]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    str r2, [r4, #24]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #20]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #16]
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r3, [r4, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r2, [r4, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #4]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI31_0:
-; CHECK-NEXT:    .long 0xff000000 @ float -1.70141183E+38
-; CHECK-NEXT:  .LCPI31_1:
-; CHECK-NEXT:    .long 0x7effffff @ float 1.70141173E+38
-    %x = call <4 x i128> @llvm.fptosi.sat.v4f32.v4i128(<4 x float> %f)
-    ret <4 x i128> %x
-}
-
-;
-; 2-Vector double to signed integer -- result size variation
-;
-
-declare <2 x   i1> @llvm.fptosi.sat.v2f64.v2i1  (<2 x double>)
-declare <2 x   i8> @llvm.fptosi.sat.v2f64.v2i8  (<2 x double>)
-declare <2 x  i13> @llvm.fptosi.sat.v2f64.v2i13 (<2 x double>)
-declare <2 x  i16> @llvm.fptosi.sat.v2f64.v2i16 (<2 x double>)
-declare <2 x  i19> @llvm.fptosi.sat.v2f64.v2i19 (<2 x double>)
-declare <2 x  i50> @llvm.fptosi.sat.v2f64.v2i50 (<2 x double>)
-declare <2 x  i64> @llvm.fptosi.sat.v2f64.v2i64 (<2 x double>)
-declare <2 x i100> @llvm.fptosi.sat.v2f64.v2i100(<2 x double>)
-declare <2 x i128> @llvm.fptosi.sat.v2f64.v2i128(<2 x double>)
-
-define arm_aapcs_vfpcc <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) {
-; CHECK-LABEL: test_signed_v2f64_v2i1:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vldr d0, .LCPI32_0
-; CHECK-NEXT:    vmov r8, r7, d8
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    strd r2, r3, [sp, #12] @ 8-byte Folded Spill
-; CHECK-NEXT:    bl __aeabi_dcmpgt
-; CHECK-NEXT:    vldr d0, .LCPI32_1
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    strd r2, r3, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    bl __aeabi_dcmpge
-; CHECK-NEXT:    mov r10, r0
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    bl __aeabi_d2iz
-; CHECK-NEXT:    mov r11, r0
-; CHECK-NEXT:    cmp.w r10, #0
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq.w r11, #-1
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    mov r3, r7
-; CHECK-NEXT:    cmp.w r9, #0
-; CHECK-NEXT:    vmov r6, r5, d9
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r11, #0
-; CHECK-NEXT:    bl __aeabi_dcmpun
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    it ne
-; CHECK-NEXT:    movne.w r11, #0
-; CHECK-NEXT:    and r0, r11, #1
-; CHECK-NEXT:    ldrd r2, r3, [sp, #12] @ 8-byte Folded Reload
-; CHECK-NEXT:    rsbs r0, r0, #0
-; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    bfi r4, r0, #0, #1
+; CHECK-NEXT:    vldr d0, .LCPI32_0
+; CHECK-NEXT:    vmov r8, r7, d8
+; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    strd r2, r3, [sp, #12] @ 8-byte Folded Spill
+; CHECK-NEXT:    bl __aeabi_dcmpgt
+; CHECK-NEXT:    vldr d0, .LCPI32_1
+; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    strd r2, r3, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT:    bl __aeabi_dcmpge
+; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    bl __aeabi_d2iz
+; CHECK-NEXT:    mov r11, r0
+; CHECK-NEXT:    cmp.w r10, #0
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    moveq.w r11, #-1
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    mov r2, r8
+; CHECK-NEXT:    mov r3, r7
+; CHECK-NEXT:    cmp.w r9, #0
+; CHECK-NEXT:    vmov r6, r5, d9
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r11, #0
+; CHECK-NEXT:    bl __aeabi_dcmpun
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    movne.w r11, #0
+; CHECK-NEXT:    and r0, r11, #1
+; CHECK-NEXT:    ldrd r2, r3, [sp, #12] @ 8-byte Folded Reload
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    bfi r4, r0, #0, #1
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl __aeabi_dcmpgt
@@ -4244,33 +3953,37 @@ define arm_aapcs_vfpcc <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-NEXT:    .vsave {d8}
 ; CHECK-NEXT:    vpush {d8}
 ; CHECK-NEXT:    vcvtb.f32.f16 s15, s0
-; CHECK-NEXT:    vmov.f32 s5, #-1.000000e+00
-; CHECK-NEXT:    vldr s7, .LCPI42_0
-; CHECK-NEXT:    vmaxnm.f32 s16, s15, s5
+; CHECK-NEXT:    vmov.f32 s7, #-1.000000e+00
+; CHECK-NEXT:    vldr s5, .LCPI42_0
+; CHECK-NEXT:    vmaxnm.f32 s16, s15, s7
 ; CHECK-NEXT:    vcvtt.f32.f16 s12, s2
 ; CHECK-NEXT:    vcvtt.f32.f16 s9, s1
-; CHECK-NEXT:    vminnm.f32 s16, s16, s7
+; CHECK-NEXT:    vminnm.f32 s16, s16, s5
 ; CHECK-NEXT:    vcvtt.f32.f16 s4, s3
 ; CHECK-NEXT:    vcvt.s32.f32 s16, s16
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s3
 ; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
 ; CHECK-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-NEXT:    vcvtt.f32.f16 s0, s0
-; CHECK-NEXT:    vmaxnm.f32 s6, s4, s5
-; CHECK-NEXT:    vmaxnm.f32 s10, s8, s5
-; CHECK-NEXT:    vmaxnm.f32 s14, s12, s5
-; CHECK-NEXT:    vmaxnm.f32 s3, s2, s5
-; CHECK-NEXT:    vmaxnm.f32 s11, s9, s5
-; CHECK-NEXT:    vmaxnm.f32 s13, s1, s5
-; CHECK-NEXT:    vmaxnm.f32 s5, s0, s5
-; CHECK-NEXT:    vminnm.f32 s5, s5, s7
-; CHECK-NEXT:    vminnm.f32 s13, s13, s7
-; CHECK-NEXT:    vcvt.s32.f32 s5, s5
-; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmaxnm.f32 s6, s4, s7
+; CHECK-NEXT:    vmaxnm.f32 s10, s8, s7
+; CHECK-NEXT:    vmaxnm.f32 s14, s12, s7
+; CHECK-NEXT:    vmaxnm.f32 s3, s2, s7
+; CHECK-NEXT:    vmaxnm.f32 s11, s9, s7
+; CHECK-NEXT:    vmaxnm.f32 s13, s1, s7
+; CHECK-NEXT:    vmaxnm.f32 s7, s0, s7
+; CHECK-NEXT:    vminnm.f32 s6, s6, s5
+; CHECK-NEXT:    vminnm.f32 s10, s10, s5
+; CHECK-NEXT:    vminnm.f32 s14, s14, s5
+; CHECK-NEXT:    vminnm.f32 s3, s3, s5
+; CHECK-NEXT:    vminnm.f32 s11, s11, s5
+; CHECK-NEXT:    vminnm.f32 s13, s13, s5
+; CHECK-NEXT:    vminnm.f32 s5, s7, s5
 ; CHECK-NEXT:    vcmp.f32 s15, s15
-; CHECK-NEXT:    vminnm.f32 s11, s11, s7
-; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vcvt.s32.f32 s5, s5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    mov.w r1, #0
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vcvt.s32.f32 s13, s13
@@ -4280,60 +3993,56 @@ define arm_aapcs_vfpcc <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    bfi r1, r2, #0, #1
 ; CHECK-NEXT:    vcvt.s32.f32 s11, s11
+; CHECK-NEXT:    vcmp.f32 s1, s1
 ; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vminnm.f32 s3, s3, s7
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcmp.f32 s1, s1
-; CHECK-NEXT:    and r2, r2, #1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    and r2, r2, #1
 ; CHECK-NEXT:    vcvt.s32.f32 s3, s3
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    vcmp.f32 s9, s9
 ; CHECK-NEXT:    bfi r1, r2, #1, #1
 ; CHECK-NEXT:    vmov r2, s13
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vminnm.f32 s14, s14, s7
-; CHECK-NEXT:    and r2, r2, #1
-; CHECK-NEXT:    vcmp.f32 s9, s9
-; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    and r2, r2, #1
+; CHECK-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    vcmp.f32 s2, s2
 ; CHECK-NEXT:    bfi r1, r2, #2, #1
 ; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    and r2, r2, #1
-; CHECK-NEXT:    vminnm.f32 s10, s10, s7
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vcmp.f32 s2, s2
+; CHECK-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    vcmp.f32 s12, s12
 ; CHECK-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    and r2, r2, #1
-; CHECK-NEXT:    rsbs r2, r2, #0
-; CHECK-NEXT:    vminnm.f32 s6, s6, s7
+; CHECK-NEXT:    vcvt.s32.f32 s6, s6
+; CHECK-NEXT:    rsb.w r2, r2, #0
+; CHECK-NEXT:    vcmp.f32 s8, s8
 ; CHECK-NEXT:    bfi r1, r2, #4, #1
-; CHECK-NEXT:    vcmp.f32 s12, s12
 ; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcvt.s32.f32 s6, s6
-; CHECK-NEXT:    and r2, r2, #1
-; CHECK-NEXT:    vcmp.f32 s8, s8
-; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    and r2, r2, #1
+; CHECK-NEXT:    vcmp.f32 s4, s4
+; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    bfi r1, r2, #5, #1
 ; CHECK-NEXT:    vmov r2, s10
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcmp.f32 s4, s4
-; CHECK-NEXT:    and r2, r2, #1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    and r2, r2, #1
 ; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    bfi r1, r2, #6, #1
 ; CHECK-NEXT:    vmov r2, s6
@@ -4360,40 +4069,40 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-MVE-NEXT:    .vsave {d8}
 ; CHECK-MVE-NEXT:    vpush {d8}
-; CHECK-MVE-NEXT:    vldr s8, .LCPI43_1
+; CHECK-MVE-NEXT:    vldr s6, .LCPI43_1
 ; CHECK-MVE-NEXT:    vcvtt.f32.f16 s13, s3
 ; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
-; CHECK-MVE-NEXT:    vldr s6, .LCPI43_0
-; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s8
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s4, s0
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
+; CHECK-MVE-NEXT:    vldr s4, .LCPI43_0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s6
 ; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s6
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s4
 ; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s4, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s0, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s1, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s2, s6
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s16, s16
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s2, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s6, s8, s6
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s4
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s15, s15
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s6
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s8, s0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s1, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s11, s11
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s8, s6
+; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s4
+; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s5, s5
+; CHECK-MVE-NEXT:    vmaxnm.f32 s6, s0, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s4, s6, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s4
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s3
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vmov r12, s16
@@ -4406,7 +4115,7 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs.w lr, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
+; CHECK-MVE-NEXT:    vmov r2, s11
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, s7
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
@@ -4416,22 +4125,22 @@ define arm_aapcs_vfpcc <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r0, s11
+; CHECK-MVE-NEXT:    vmov r0, s5
 ; CHECK-MVE-NEXT:    vcmp.f32 s12, s12
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vmov r4, s5
+; CHECK-MVE-NEXT:    vmov r4, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
+; CHECK-MVE-NEXT:    vcmp.f32 s8, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r4, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s4
+; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
 ; CHECK-MVE-NEXT:    vmov.16 q0[0], r4
-; CHECK-MVE-NEXT:    vmov r5, s10
+; CHECK-MVE-NEXT:    vmov r5, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r5, #0
@@ -4468,40 +4177,40 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-MVE-NEXT:    .vsave {d8}
 ; CHECK-MVE-NEXT:    vpush {d8}
-; CHECK-MVE-NEXT:    vldr s8, .LCPI44_1
+; CHECK-MVE-NEXT:    vldr s6, .LCPI44_1
 ; CHECK-MVE-NEXT:    vcvtt.f32.f16 s13, s3
 ; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
-; CHECK-MVE-NEXT:    vldr s6, .LCPI44_0
-; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s8
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s4, s0
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
+; CHECK-MVE-NEXT:    vldr s4, .LCPI44_0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s6
 ; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s6
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s4
 ; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s4, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s0, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s1, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s2, s6
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s16, s16
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s2, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s6, s8, s6
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s4
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s15, s15
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s6
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s8, s0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s1, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s11, s11
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s8, s6
+; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s4
+; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s5, s5
+; CHECK-MVE-NEXT:    vmaxnm.f32 s6, s0, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s4, s6, s4
 ; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s4
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s3
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vmov r12, s16
@@ -4514,7 +4223,7 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs.w lr, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
+; CHECK-MVE-NEXT:    vmov r2, s11
 ; CHECK-MVE-NEXT:    vcmp.f32 s7, s7
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r2, #0
@@ -4524,22 +4233,22 @@ define arm_aapcs_vfpcc <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r3, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r0, s11
+; CHECK-MVE-NEXT:    vmov r0, s5
 ; CHECK-MVE-NEXT:    vcmp.f32 s12, s12
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vmov r4, s5
+; CHECK-MVE-NEXT:    vmov r4, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
+; CHECK-MVE-NEXT:    vcmp.f32 s8, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r4, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s4
+; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
 ; CHECK-MVE-NEXT:    vmov.16 q0[0], r4
-; CHECK-MVE-NEXT:    vmov r5, s10
+; CHECK-MVE-NEXT:    vmov r5, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r5, #0
@@ -4578,838 +4287,273 @@ define arm_aapcs_vfpcc <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
 ; CHECK-MVE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-MVE-NEXT:    .vsave {d8}
 ; CHECK-MVE-NEXT:    vpush {d8}
-; CHECK-MVE-NEXT:    vldr s8, .LCPI45_1
+; CHECK-MVE-NEXT:    vldr s6, .LCPI45_1
 ; CHECK-MVE-NEXT:    vcvtt.f32.f16 s13, s3
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
-; CHECK-MVE-NEXT:    vldr s6, .LCPI45_0
-; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s8
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s4, s0
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
-; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s6
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
-; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s4, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s0, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s8
-; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s1, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s6
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s16, s16
-; CHECK-MVE-NEXT:    vmaxnm.f32 s8, s2, s8
-; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s6
-; CHECK-MVE-NEXT:    vminnm.f32 s6, s8, s6
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s15, s15
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s6, s6
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s11, s11
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s5, s5
-; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s10
-; CHECK-MVE-NEXT:    vcmp.f32 s3, s3
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r12, s16
-; CHECK-MVE-NEXT:    vcmp.f32 s13, s13
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs.w r12, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov lr, s15
-; CHECK-MVE-NEXT:    vcmp.f32 s2, s2
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs.w lr, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vcmp.f32 s7, s7
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r3, s9
-; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r3, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r0, s11
-; CHECK-MVE-NEXT:    vcmp.f32 s12, s12
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r0, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vmov r4, s5
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r1, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r4, #0
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s4
-; CHECK-MVE-NEXT:    vmov.16 q0[0], r4
-; CHECK-MVE-NEXT:    vmov r5, s10
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r5, #0
-; CHECK-MVE-NEXT:    vmov.16 q0[1], r5
-; CHECK-MVE-NEXT:    vmov.16 q0[2], r0
-; CHECK-MVE-NEXT:    vmov.16 q0[3], r1
-; CHECK-MVE-NEXT:    vmov.16 q0[4], r2
-; CHECK-MVE-NEXT:    vmov.16 q0[5], r3
-; CHECK-MVE-NEXT:    vmov.16 q0[6], r12
-; CHECK-MVE-NEXT:    vmov.16 q0[7], lr
-; CHECK-MVE-NEXT:    vpop {d8}
-; CHECK-MVE-NEXT:    pop {r4, r5, r7, pc}
-; CHECK-MVE-NEXT:    .p2align 2
-; CHECK-MVE-NEXT:  @ %bb.1:
-; CHECK-MVE-NEXT:  .LCPI45_0:
-; CHECK-MVE-NEXT:    .long 0x46fffe00 @ float 32767
-; CHECK-MVE-NEXT:  .LCPI45_1:
-; CHECK-MVE-NEXT:    .long 0xc7000000 @ float -32768
-;
-; CHECK-MVEFP-LABEL: test_signed_v8f16_v8i16:
-; CHECK-MVEFP:       @ %bb.0:
-; CHECK-MVEFP-NEXT:    vcvt.s16.f16 q0, q0
-; CHECK-MVEFP-NEXT:    bx lr
-    %x = call <8 x i16> @llvm.fptosi.sat.v8f16.v8i16(<8 x half> %f)
-    ret <8 x i16> %x
-}
-
-define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i19:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r7, r9, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r7, r9, r11, lr}
-; CHECK-NEXT:    vldr s4, .LCPI46_0
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s12, s1
-; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
-; CHECK-NEXT:    vldr s6, .LCPI46_1
-; CHECK-NEXT:    vmaxnm.f32 s5, s1, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-NEXT:    vmaxnm.f32 s14, s12, s4
-; CHECK-NEXT:    vminnm.f32 s5, s5, s6
-; CHECK-NEXT:    vmaxnm.f32 s7, s0, s4
-; CHECK-NEXT:    vminnm.f32 s7, s7, s6
-; CHECK-NEXT:    vcvt.s32.f32 s5, s5
-; CHECK-NEXT:    vcvt.s32.f32 s7, s7
-; CHECK-NEXT:    vminnm.f32 s14, s14, s6
-; CHECK-NEXT:    vcvt.s32.f32 s14, s14
-; CHECK-NEXT:    vmaxnm.f32 s10, s8, s4
-; CHECK-NEXT:    vminnm.f32 s10, s10, s6
-; CHECK-NEXT:    vcmp.f32 s1, s1
-; CHECK-NEXT:    vcvt.s32.f32 s10, s10
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s0, s0
-; CHECK-NEXT:    mov.w r7, #0
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s2
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    mov.w r5, #0
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    bfc r2, #19, #13
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vcmp.f32 s12, s12
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s8, s8
-; CHECK-NEXT:    lsll r2, r7, #19
-; CHECK-NEXT:    bfc r1, #19, #13
-; CHECK-NEXT:    vmov r12, s14
-; CHECK-NEXT:    vmaxnm.f32 s8, s0, s4
-; CHECK-NEXT:    orr.w r1, r1, r2
-; CHECK-NEXT:    str r1, [r0]
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r12, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s0, s0
-; CHECK-NEXT:    vcvtt.f32.f16 s0, s2
-; CHECK-NEXT:    vmaxnm.f32 s2, s0, s4
-; CHECK-NEXT:    vminnm.f32 s8, s8, s6
-; CHECK-NEXT:    vminnm.f32 s2, s2, s6
-; CHECK-NEXT:    vmov r3, s10
-; CHECK-NEXT:    vcvt.s32.f32 s2, s2
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vcvt.s32.f32 s8, s8
-; CHECK-NEXT:    bfc r3, #19, #13
-; CHECK-NEXT:    mov r2, r12
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    bfc r2, #19, #13
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    lsrl r2, r1, #7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s0, s0
-; CHECK-NEXT:    lsrl r4, r9, #26
-; CHECK-NEXT:    vcvtt.f32.f16 s0, s3
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    orr.w r1, r4, r2
-; CHECK-NEXT:    vmov r4, s2
-; CHECK-NEXT:    vmaxnm.f32 s2, s0, s4
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vminnm.f32 s2, s2, s6
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcvt.s32.f32 s2, s2
-; CHECK-NEXT:    bfc r2, #19, #13
-; CHECK-NEXT:    lsll r2, r5, #12
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    orrs r2, r1
-; CHECK-NEXT:    bfc r4, #19, #13
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    lsll r4, r1, #31
-; CHECK-NEXT:    vcmp.f32 s0, s0
-; CHECK-NEXT:    orrs r2, r4
-; CHECK-NEXT:    str r2, [r0, #8]
-; CHECK-NEXT:    orr.w r2, r7, r3, lsl #6
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s3
-; CHECK-NEXT:    orr.w r3, r2, r12, lsl #25
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmaxnm.f32 s2, s0, s4
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vminnm.f32 s2, s2, s6
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vcvt.s32.f32 s2, s2
-; CHECK-NEXT:    bfc r2, #19, #13
-; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    vcmp.f32 s0, s0
-; CHECK-NEXT:    lsll r2, r7, #5
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov.w r11, #0
-; CHECK-NEXT:    vmov r7, s2
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    bfc r4, #19, #13
-; CHECK-NEXT:    lsrl r4, r11, #14
-; CHECK-NEXT:    orrs r2, r4
-; CHECK-NEXT:    strh r2, [r0, #16]
-; CHECK-NEXT:    str r3, [r0, #4]
-; CHECK-NEXT:    lsrs r2, r2, #16
-; CHECK-NEXT:    strb r2, [r0, #18]
-; CHECK-NEXT:    orr.w r2, r9, lr
-; CHECK-NEXT:    orrs r2, r5
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    orr.w r1, r1, r7, lsl #18
-; CHECK-NEXT:    str r1, [r0, #12]
-; CHECK-NEXT:    pop.w {r4, r5, r7, r9, r11, pc}
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI46_0:
-; CHECK-NEXT:    .long 0xc8800000 @ float -262144
-; CHECK-NEXT:  .LCPI46_1:
-; CHECK-NEXT:    .long 0x487fffc0 @ float 262143
-    %x = call <8 x i19> @llvm.fptosi.sat.v8f16.v8i19(<8 x half> %f)
-    ret <8 x i19> %x
-}
-
-define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i32_duplicate:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmovx.f16 s6, s0
-; CHECK-NEXT:    vcvt.s32.f16 s8, s4
-; CHECK-NEXT:    vmovx.f16 s4, s2
-; CHECK-NEXT:    vcvt.s32.f16 s10, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vcvt.s32.f16 s14, s2
-; CHECK-NEXT:    vcvt.s32.f16 s2, s1
-; CHECK-NEXT:    vcvt.s32.f16 s0, s0
-; CHECK-NEXT:    vcvt.s32.f16 s4, s4
-; CHECK-NEXT:    vcvt.s32.f16 s6, s6
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vcvt.s32.f16 s12, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r1, s14
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    bx lr
-    %x = call <8 x i32> @llvm.fptosi.sat.v8f16.v8i32(<8 x half> %f)
-    ret <8 x i32> %x
-}
-
-define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i50:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    vcvtt.f32.f16 s30, s19
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s18
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vldr s24, .LCPI48_1
-; CHECK-NEXT:    vcvtb.f32.f16 s20, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s28, s19
-; CHECK-NEXT:    vcmp.f32 s30, s24
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vmov r7, s20
-; CHECK-NEXT:    vldr s22, .LCPI48_0
-; CHECK-NEXT:    vmov r6, s28
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    movtlt r5, #65534
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s26, s24
-; CHECK-NEXT:    mov r10, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r5, #65535
-; CHECK-NEXT:    movtgt r5, #1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9, #25]
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s20, s24
-; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s24
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r9]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    mov r7, r5
-; CHECK-NEXT:    bfc r7, #18, #14
-; CHECK-NEXT:    lsll r4, r7, #22
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    movtlt r1, #65534
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r1, #65535
-; CHECK-NEXT:    movtgt r1, #1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    mov r2, r6
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s18
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    orr.w r0, r1, r7
-; CHECK-NEXT:    str.w r0, [r9, #45]
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    orrs r4, r2
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    movtlt r1, #65534
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s17
-; CHECK-NEXT:    lsrs r0, r5, #10
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r1, #65535
-; CHECK-NEXT:    movtgt r1, #1
-; CHECK-NEXT:    str.w r4, [r9, #41]
-; CHECK-NEXT:    strb.w r0, [r9, #49]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    lsrl r4, r1, #14
-; CHECK-NEXT:    orr.w r6, r1, r6, lsl #4
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s17
-; CHECK-NEXT:    mov r11, r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vcmp.f32 s18, s24
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    movtlt r5, #65534
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r5, #65535
-; CHECK-NEXT:    movtgt r5, #1
-; CHECK-NEXT:    str.w r6, [r9, #37]
-; CHECK-NEXT:    str.w r4, [r9, #33]
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    movtlt r4, #65534
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s24
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r4, #65535
-; CHECK-NEXT:    movtgt r4, #1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movwlt r10, #0
-; CHECK-NEXT:    movtlt r10, #65534
-; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r10, #65535
-; CHECK-NEXT:    movtgt r10, #1
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r10, #0
-; CHECK-NEXT:    bfc r10, #18, #14
-; CHECK-NEXT:    vcmp.f32 s28, s24
-; CHECK-NEXT:    orr.w r0, r10, r7, lsl #18
-; CHECK-NEXT:    str.w r0, [r9, #29]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s24
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    movvs r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    mov r1, r4
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs.w r11, #0
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    vcmp.f32 s20, s24
-; CHECK-NEXT:    bfc r5, #18, #14
-; CHECK-NEXT:    mov r10, r11
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsll r6, r1, #22
-; CHECK-NEXT:    lsrl r10, r5, #28
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movwlt r8, #0
-; CHECK-NEXT:    movtlt r8, #65534
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r8, #65535
-; CHECK-NEXT:    movtgt r8, #1
-; CHECK-NEXT:    orrs r1, r5
-; CHECK-NEXT:    str.w r1, [r9, #20]
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s16, s24
-; CHECK-NEXT:    orr.w r2, r10, r6
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    movtlt r1, #65534
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r1, #65535
-; CHECK-NEXT:    movtgt r1, #1
-; CHECK-NEXT:    str.w r2, [r9, #16]
-; CHECK-NEXT:    lsrs r2, r4, #10
-; CHECK-NEXT:    vcmp.f32 s16, s24
-; CHECK-NEXT:    strb.w r2, [r9, #24]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    lsrl r2, r1, #14
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    orr.w r1, r1, r11, lsl #4
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strd r2, r1, [r9, #8]
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r8, #0
-; CHECK-NEXT:    bfc r8, #18, #14
-; CHECK-NEXT:    orr.w r0, r8, r0, lsl #18
-; CHECK-NEXT:    str.w r0, [r9, #4]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI48_0:
-; CHECK-NEXT:    .long 0x57ffffff @ float 5.6294992E+14
-; CHECK-NEXT:  .LCPI48_1:
-; CHECK-NEXT:    .long 0xd8000000 @ float -5.62949953E+14
-    %x = call <8 x i50> @llvm.fptosi.sat.v8f16.v8i50(<8 x half> %f)
-    ret <8 x i50> %x
-}
-
-define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i64:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vcvtt.f32.f16 s20, s19
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtb.f32.f16 s22, s19
-; CHECK-NEXT:    mov r9, r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vldr s30, .LCPI49_1
-; CHECK-NEXT:    vldr s28, .LCPI49_0
-; CHECK-NEXT:    vcvtb.f32.f16 s24, s16
-; CHECK-NEXT:    vcmp.f32 s20, s30
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s20, s28
-; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r9, #-1
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r9, #0
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s22, s30
-; CHECK-NEXT:    mov r11, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s22
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s30
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r11, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r8, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r8, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r10, r1
-; CHECK-NEXT:    vcmp.f32 s22, s30
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r8, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s22, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r10, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s22, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r10, #0
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vcmp.f32 s16, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    mov r0, r4
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtt.f32.f16 s19, s17
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s19
-; CHECK-NEXT:    vcmp.f32 s24, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s24, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s17
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    vcmp.f32 s19, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
+; CHECK-MVE-NEXT:    vldr s4, .LCPI45_0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s16, s3, s6
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s2
+; CHECK-MVE-NEXT:    vmaxnm.f32 s15, s13, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s16, s16, s4
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vminnm.f32 s15, s15, s4
+; CHECK-MVE-NEXT:    vmaxnm.f32 s11, s2, s6
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s16, s16
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s12, s1
+; CHECK-MVE-NEXT:    vmaxnm.f32 s9, s7, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s11, s11, s4
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s15, s15
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s8, s0
+; CHECK-MVE-NEXT:    vmaxnm.f32 s5, s1, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s9, s9, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s11, s11
+; CHECK-MVE-NEXT:    vmaxnm.f32 s10, s8, s6
+; CHECK-MVE-NEXT:    vmaxnm.f32 s14, s12, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s5, s5, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s9, s9
+; CHECK-MVE-NEXT:    vcvtt.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vminnm.f32 s10, s10, s4
+; CHECK-MVE-NEXT:    vminnm.f32 s14, s14, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s5, s5
+; CHECK-MVE-NEXT:    vmaxnm.f32 s6, s0, s6
+; CHECK-MVE-NEXT:    vminnm.f32 s4, s6, s4
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-MVE-NEXT:    vcvt.s32.f32 s4, s4
+; CHECK-MVE-NEXT:    vcmp.f32 s3, s3
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r12, s16
+; CHECK-MVE-NEXT:    vcmp.f32 s13, s13
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs.w r12, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov lr, s15
+; CHECK-MVE-NEXT:    vcmp.f32 s2, s2
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs.w lr, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r2, s11
+; CHECK-MVE-NEXT:    vcmp.f32 s7, s7
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r2, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r3, s9
+; CHECK-MVE-NEXT:    vcmp.f32 s1, s1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r3, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r0, s5
+; CHECK-MVE-NEXT:    vcmp.f32 s12, s12
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r0, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r1, s14
+; CHECK-MVE-NEXT:    vmov r4, s10
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s8, s8
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r4, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov.16 q0[0], r4
+; CHECK-MVE-NEXT:    vmov r5, s4
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r5, #0
+; CHECK-MVE-NEXT:    vmov.16 q0[1], r5
+; CHECK-MVE-NEXT:    vmov.16 q0[2], r0
+; CHECK-MVE-NEXT:    vmov.16 q0[3], r1
+; CHECK-MVE-NEXT:    vmov.16 q0[4], r2
+; CHECK-MVE-NEXT:    vmov.16 q0[5], r3
+; CHECK-MVE-NEXT:    vmov.16 q0[6], r12
+; CHECK-MVE-NEXT:    vmov.16 q0[7], lr
+; CHECK-MVE-NEXT:    vpop {d8}
+; CHECK-MVE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-MVE-NEXT:    .p2align 2
+; CHECK-MVE-NEXT:  @ %bb.1:
+; CHECK-MVE-NEXT:  .LCPI45_0:
+; CHECK-MVE-NEXT:    .long 0x46fffe00 @ float 32767
+; CHECK-MVE-NEXT:  .LCPI45_1:
+; CHECK-MVE-NEXT:    .long 0xc7000000 @ float -32768
+;
+; CHECK-MVEFP-LABEL: test_signed_v8f16_v8i16:
+; CHECK-MVEFP:       @ %bb.0:
+; CHECK-MVEFP-NEXT:    vcvt.s16.f16 q0, q0
+; CHECK-MVEFP-NEXT:    bx lr
+    %x = call <8 x i16> @llvm.fptosi.sat.v8f16.v8i16(<8 x half> %f)
+    ret <8 x i16> %x
+}
+
+define arm_aapcs_vfpcc <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i19:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r7, r9, r11, lr}
+; CHECK-NEXT:    vldr s6, .LCPI46_1
+; CHECK-NEXT:    vcvtb.f32.f16 s12, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s0
+; CHECK-NEXT:    vldr s4, .LCPI46_0
+; CHECK-NEXT:    vmaxnm.f32 s5, s0, s6
+; CHECK-NEXT:    vmaxnm.f32 s14, s12, s6
+; CHECK-NEXT:    vminnm.f32 s5, s5, s4
+; CHECK-NEXT:    vcvtt.f32.f16 s8, s1
+; CHECK-NEXT:    vminnm.f32 s14, s14, s4
+; CHECK-NEXT:    vcvt.s32.f32 s5, s5
+; CHECK-NEXT:    vmaxnm.f32 s10, s8, s6
+; CHECK-NEXT:    vcvt.s32.f32 s14, s14
+; CHECK-NEXT:    vminnm.f32 s10, s10, s4
+; CHECK-NEXT:    vcvtb.f32.f16 s1, s1
+; CHECK-NEXT:    vcvt.s32.f32 s10, s10
+; CHECK-NEXT:    vmaxnm.f32 s7, s1, s6
+; CHECK-NEXT:    vminnm.f32 s7, s7, s4
+; CHECK-NEXT:    vcmp.f32 s0, s0
+; CHECK-NEXT:    vcvt.s32.f32 s7, s7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s30
+; CHECK-NEXT:    vcmp.f32 s12, s12
+; CHECK-NEXT:    mov.w r7, #0
+; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s2
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r5, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r5, #-2147483648
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    bfc r2, #19, #13
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s30
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r7, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s24, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r7, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s24, s24
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vcmp.f32 s8, s8
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsll r2, r7, #19
+; CHECK-NEXT:    bfc r1, #19, #13
+; CHECK-NEXT:    vmov r12, s10
+; CHECK-NEXT:    vcmp.f32 s1, s1
+; CHECK-NEXT:    vmaxnm.f32 s8, s0, s6
+; CHECK-NEXT:    orr.w r1, r1, r2
+; CHECK-NEXT:    str r1, [r0]
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s18
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vcmp.f32 s17, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s17, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s17, s17
+; CHECK-NEXT:    movvs.w r12, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s0, s0
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s2
+; CHECK-NEXT:    vmaxnm.f32 s2, s0, s6
+; CHECK-NEXT:    vminnm.f32 s8, s8, s4
+; CHECK-NEXT:    vminnm.f32 s2, s2, s4
+; CHECK-NEXT:    vmov r3, s7
+; CHECK-NEXT:    vcvt.s32.f32 s2, s2
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vcmp.f32 s16, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    vcvt.s32.f32 s8, s8
+; CHECK-NEXT:    bfc r3, #19, #13
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    bfc r2, #19, #13
+; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    lsrl r2, r1, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s30
+; CHECK-NEXT:    vcmp.f32 s0, s0
+; CHECK-NEXT:    lsrl r4, r9, #26
+; CHECK-NEXT:    vcvtt.f32.f16 s0, s3
+; CHECK-NEXT:    mov lr, r1
+; CHECK-NEXT:    orr.w r1, r4, r2
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmaxnm.f32 s2, s0, s6
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vminnm.f32 s2, s2, s4
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r4, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r4, #-2147483648
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    vcvt.s32.f32 s2, s2
+; CHECK-NEXT:    bfc r2, #19, #13
+; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    lsll r2, r5, #12
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s30
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    orrs r2, r1
+; CHECK-NEXT:    bfc r4, #19, #13
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    lsll r4, r1, #31
+; CHECK-NEXT:    vcmp.f32 s0, s0
+; CHECK-NEXT:    orrs r2, r4
+; CHECK-NEXT:    str r2, [r0, #8]
+; CHECK-NEXT:    orr.w r2, r7, r3, lsl #6
+; CHECK-NEXT:    vcvtb.f32.f16 s0, s3
+; CHECK-NEXT:    orr.w r3, r2, r12, lsl #25
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmaxnm.f32 s2, s0, s6
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r7, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s17, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r7, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s17, s17
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vmov q6[3], q6[1], r7, r4
-; CHECK-NEXT:    bl __aeabi_f2lz
-; CHECK-NEXT:    vcmp.f32 s18, s30
-; CHECK-NEXT:    vmov q3[2], q3[0], r11, r9
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s30
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r5, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r5, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s30
+; CHECK-NEXT:    vminnm.f32 s2, s2, s4
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r1, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s18, s28
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r6
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r1, #-2147483648
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    vmov q3[3], q3[1], r10, r8
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    vcvt.s32.f32 s2, s2
+; CHECK-NEXT:    bfc r2, #19, #13
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    vcmp.f32 s0, s0
+; CHECK-NEXT:    lsll r2, r7, #5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    vmov r7, s2
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r5
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vmov q1, q6
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    mov r4, r7
+; CHECK-NEXT:    bfc r4, #19, #13
+; CHECK-NEXT:    lsrl r4, r11, #14
+; CHECK-NEXT:    orrs r2, r4
+; CHECK-NEXT:    strh r2, [r0, #16]
+; CHECK-NEXT:    str r3, [r0, #4]
+; CHECK-NEXT:    lsrs r2, r2, #16
+; CHECK-NEXT:    strb r2, [r0, #18]
+; CHECK-NEXT:    orr.w r2, r9, lr
+; CHECK-NEXT:    orrs r2, r5
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    orr.w r1, r1, r7, lsl #18
+; CHECK-NEXT:    str r1, [r0, #12]
+; CHECK-NEXT:    pop.w {r4, r5, r7, r9, r11, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI49_0:
-; CHECK-NEXT:    .long 0x5effffff @ float 9.22337149E+18
-; CHECK-NEXT:  .LCPI49_1:
-; CHECK-NEXT:    .long 0xdf000000 @ float -9.22337203E+18
-    %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f)
-    ret <8 x i64> %x
+; CHECK-NEXT:  .LCPI46_0:
+; CHECK-NEXT:    .long 0x487fffc0 @ float 262143
+; CHECK-NEXT:  .LCPI46_1:
+; CHECK-NEXT:    .long 0xc8800000 @ float -262144
+    %x = call <8 x i19> @llvm.fptosi.sat.v8f16.v8i19(<8 x half> %f)
+    ret <8 x i19> %x
 }
 
-define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i100:
+define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i32_duplicate:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vmovx.f16 s6, s0
+; CHECK-NEXT:    vcvt.s32.f16 s8, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vcvt.s32.f16 s10, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vcvt.s32.f16 s14, s2
+; CHECK-NEXT:    vcvt.s32.f16 s2, s1
+; CHECK-NEXT:    vcvt.s32.f16 s0, s0
+; CHECK-NEXT:    vcvt.s32.f16 s4, s4
+; CHECK-NEXT:    vcvt.s32.f16 s6, s6
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vcvt.s32.f16 s12, s3
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    bx lr
+    %x = call <8 x i32> @llvm.fptosi.sat.v8f16.v8i32(<8 x half> %f)
+    ret <8 x i32> %x
+}
+
+define arm_aapcs_vfpcc <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i50:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
@@ -5417,996 +4561,1039 @@ define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r11, r0
-; CHECK-NEXT:    vcvtb.f32.f16 s21, s19
-; CHECK-NEXT:    vcvtt.f32.f16 s24, s19
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s16
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s17
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcvtb.f32.f16 s28, s17
-; CHECK-NEXT:    vcvtb.f32.f16 s30, s18
-; CHECK-NEXT:    vldr s20, .LCPI50_2
-; CHECK-NEXT:    vmov r8, s24
-; CHECK-NEXT:    vmov r9, s26
-; CHECK-NEXT:    vcvtt.f32.f16 s22, s18
-; CHECK-NEXT:    vmov r6, s28
-; CHECK-NEXT:    vmov r7, s30
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vldr s18, .LCPI50_3
-; CHECK-NEXT:    mov r5, r3
-; CHECK-NEXT:    vcmp.f32 s21, s18
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s21
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcvtt.f32.f16 s30, s16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r0, s30
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcvtb.f32.f16 s26, s18
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vldr s20, .LCPI48_0
+; CHECK-NEXT:    vldr s22, .LCPI48_1
+; CHECK-NEXT:    vcmp.f32 s30, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s20
-; CHECK-NEXT:    str.w r2, [r11, #83]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movtlt r1, #65534
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s21
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r11, #79]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s21, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s21, s21
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r11, #75]
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    vcmp.f32 s30, s22
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r1, #65535
+; CHECK-NEXT:    movtgt r1, #1
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s18
+; CHECK-NEXT:    vcmp.f32 s28, s20
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str.w r2, [r11, #58]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s18
+; CHECK-NEXT:    str r2, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r11, #54]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r11, #50]
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s28, s18
-; CHECK-NEXT:    str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    vcmp.f32 s28, s22
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movtlt r5, #65534
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r5, #65535
+; CHECK-NEXT:    movtgt r5, #1
+; CHECK-NEXT:    movgt.w r4, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s18
+; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str.w r2, [r11, #33]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s18
+; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    movvs r5, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r11, #29]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    str r5, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movtlt r6, #65534
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    vcmp.f32 s24, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s28, s28
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r6, #65535
+; CHECK-NEXT:    movtgt r6, #1
+; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r11, #25]
-; CHECK-NEXT:    mov r0, r9
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s26, s18
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s18
+; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    str.w r8, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    str r6, [sp] @ 4-byte Spill
+; CHECK-NEXT:    bl __aeabi_f2lz
 ; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str.w r2, [r11, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r11, #4]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movtlt r6, #65534
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
+; CHECK-NEXT:    vcmp.f32 s26, s22
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r6, #65535
+; CHECK-NEXT:    movtgt r6, #1
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vcmp.f32 s26, s26
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str.w r0, [r11]
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s21, s18
+; CHECK-NEXT:    str.w r0, [r7, #25]
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    mov r8, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r5, #7
-; CHECK-NEXT:    vcmp.f32 s21, s20
 ; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movwlt r9, #0
+; CHECK-NEXT:    movtlt r9, #65534
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #7
-; CHECK-NEXT:    vcmp.f32 s21, s21
-; CHECK-NEXT:    mov r8, r2
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s19
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r9, #65535
+; CHECK-NEXT:    movtgt r9, #1
+; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    and r0, r5, #15
-; CHECK-NEXT:    orr.w r1, r0, r6, lsl #4
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    str.w r1, [r11, #87]
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s22, s18
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    ittt lt
 ; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movwlt r11, #0
+; CHECK-NEXT:    movtlt r11, #65534
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s22
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s19
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r11, #65535
+; CHECK-NEXT:    movtgt r11, #1
 ; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs.w r10, #0
+; CHECK-NEXT:    movvs.w r11, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    and r0, r7, #15
-; CHECK-NEXT:    orr.w r0, r0, r10, lsl #4
-; CHECK-NEXT:    vcvtt.f32.f16 s30, s17
-; CHECK-NEXT:    str.w r0, [r11, #62]
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s18
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r0, #7
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movtlt r5, #65534
+; CHECK-NEXT:    vcmp.f32 s18, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, #7
-; CHECK-NEXT:    vcmp.f32 s28, s28
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r5, #65535
+; CHECK-NEXT:    movtgt r5, #1
+; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    vcmp.f32 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    and r0, r0, #15
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT:    str.w r0, [r11, #37]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s16, s18
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    bfc r11, #18, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    ittt lt
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movtlt r1, #65534
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    vcmp.f32 s16, s22
+; CHECK-NEXT:    mov r2, r10
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r1, #65535
+; CHECK-NEXT:    movtgt r1, #1
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    lsrl r2, r11, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s18
-; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
-; CHECK-NEXT:    vcmp.f32 s26, s26
+; CHECK-NEXT:    str r0, [r7]
+; CHECK-NEXT:    lsrs r0, r5, #10
+; CHECK-NEXT:    bfc r5, #18, #14
+; CHECK-NEXT:    bfc r9, #18, #14
+; CHECK-NEXT:    lsll r4, r5, #22
+; CHECK-NEXT:    bfc r6, #18, #14
+; CHECK-NEXT:    orr.w r3, r11, r5
+; CHECK-NEXT:    str.w r3, [r7, #45]
+; CHECK-NEXT:    orrs r2, r4
+; CHECK-NEXT:    str.w r2, [r7, #41]
+; CHECK-NEXT:    strb.w r0, [r7, #49]
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    lsrl r0, r9, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    orr.w r2, r9, r10, lsl #4
+; CHECK-NEXT:    str.w r2, [r7, #37]
+; CHECK-NEXT:    str.w r0, [r7, #33]
+; CHECK-NEXT:    orr.w r0, r6, r8, lsl #18
+; CHECK-NEXT:    str.w r0, [r7, #29]
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    bfc r5, #18, #14
+; CHECK-NEXT:    lsr.w r0, r3, #10
+; CHECK-NEXT:    bfc r3, #18, #14
+; CHECK-NEXT:    mov r2, r4
+; CHECK-NEXT:    lsll r6, r3, #22
+; CHECK-NEXT:    lsrl r2, r5, #28
+; CHECK-NEXT:    orr.w r3, r3, r5
+; CHECK-NEXT:    str r3, [r7, #20]
+; CHECK-NEXT:    orr.w r2, r2, r6
+; CHECK-NEXT:    str r2, [r7, #16]
+; CHECK-NEXT:    strb r0, [r7, #24]
+; CHECK-NEXT:    ldr r3, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    bfc r3, #18, #14
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    lsrl r0, r3, #14
+; CHECK-NEXT:    orr.w r2, r3, r4, lsl #4
+; CHECK-NEXT:    strd r0, r2, [r7, #8]
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    and r7, r7, #15
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
-; CHECK-NEXT:    str.w r7, [r11, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    b.w .LBB50_3
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    orr.w r0, r1, r6, lsl #18
+; CHECK-NEXT:    str r0, [r7, #4]
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI50_2:
-; CHECK-NEXT:    .long 0x70ffffff @ float 6.33825262E+29
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:  .LCPI50_3:
-; CHECK-NEXT:    .long 0xf1000000 @ float -6.338253E+29
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  .LBB50_3:
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r9, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:  .LCPI48_0:
+; CHECK-NEXT:    .long 0xd8000000 @ float -5.62949953E+14
+; CHECK-NEXT:  .LCPI48_1:
+; CHECK-NEXT:    .long 0x57ffffff @ float 5.6294992E+14
+    %x = call <8 x i50> @llvm.fptosi.sat.v8f16.v8i50(<8 x half> %f)
+    ret <8 x i50> %x
+}
+
+define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s19
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcvtb.f32.f16 s26, s19
+; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vldr s30, .LCPI49_0
+; CHECK-NEXT:    vldr s28, .LCPI49_1
+; CHECK-NEXT:    mov r8, r1
+; CHECK-NEXT:    vcmp.f32 s24, s30
+; CHECK-NEXT:    vcvtt.f32.f16 s22, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r8, #0
-; CHECK-NEXT:    vcmp.f32 s24, s20
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r8, #-2147483648
+; CHECK-NEXT:    movlt.w r9, #0
+; CHECK-NEXT:    vcmp.f32 s24, s28
+; CHECK-NEXT:    vcvtt.f32.f16 s20, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r6, r9, #28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r9, #-1
+; CHECK-NEXT:    mvngt r8, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov r6, s20
+; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs.w r8, #0
-; CHECK-NEXT:    orr.w r7, r9, r8, lsl #4
-; CHECK-NEXT:    str.w r7, [r11, #95]
-; CHECK-NEXT:    str.w r6, [r11, #91]
-; CHECK-NEXT:    vcmp.f32 s24, s18
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    movvs.w r9, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    vcmp.f32 s26, s30
+; CHECK-NEXT:    mov r11, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
-; CHECK-NEXT:    vcmp.f32 s24, s20
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movlt.w r11, #-2147483648
+; CHECK-NEXT:    vcmp.f32 s26, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
-; CHECK-NEXT:    vcmp.f32 s24, s24
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    mvngt r11, #-2147483648
+; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    vcmp.f32 s26, s26
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    and r7, r7, #15
-; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    lsrl r8, r7, #28
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs.w r10, #0
+; CHECK-NEXT:    movvs.w r11, #0
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vcmp.f32 s22, s30
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r8, [r11, #99]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r4, #-2147483648
 ; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    vcmp.f32 s22, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    mvngt r4, #-2147483648
 ; CHECK-NEXT:    vcmp.f32 s22, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r4, #0
 ; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    vcmp.f32 s20, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    lsrl r10, r5, #28
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r6, #-2147483648
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    vcmp.f32 s20, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vcmp.f32 s22, s22
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    mvngt r6, #-2147483648
+; CHECK-NEXT:    vcmp.f32 s20, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    str.w r7, [r11, #70]
-; CHECK-NEXT:    str.w r10, [r11, #66]
-; CHECK-NEXT:    vcmp.f32 s22, s18
-; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
-; CHECK-NEXT:    vcmp.f32 s22, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    and r5, r7, #15
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    lsrl r6, r5, #28
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s16, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r5, r4
-; CHECK-NEXT:    strb.w r6, [r11, #74]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt.w r1, #-2147483648
+; CHECK-NEXT:    vcmp.f32 s16, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
+; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    mvngt r1, #-2147483648
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r5, #0
-; CHECK-NEXT:    ldr r4, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r4, r5, #28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s17
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r7
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r6, #0
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    str.w r7, [r11, #45]
-; CHECK-NEXT:    str.w r4, [r11, #41]
-; CHECK-NEXT:    vcmp.f32 s30, s18
-; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q5[3], q5[1], r1, r6
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s16, s30
+; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r7, #7
-; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    vcmp.f32 s16, s28
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r6, #-2147483648
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #7
-; CHECK-NEXT:    vcmp.f32 s30, s30
+; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s17
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    mvngt r6, #-2147483648
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r6, #0
 ; CHECK-NEXT:    movvs r7, #0
-; CHECK-NEXT:    and r5, r7, #15
-; CHECK-NEXT:    vcmp.f32 s16, s18
-; CHECK-NEXT:    lsrl r6, r5, #28
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s16, s30
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    strb.w r6, [r11, #49]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt.w r1, #-2147483648
+; CHECK-NEXT:    vcmp.f32 s16, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    mvngt r1, #-2147483648
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s18
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r0, r1, #28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vcmp.f32 s16, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s18
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r7
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    vcmp.f32 s16, s18
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q6[3], q6[1], r1, r6
+; CHECK-NEXT:    bl __aeabi_f2lz
+; CHECK-NEXT:    vcmp.f32 s16, s30
+; CHECK-NEXT:    vmov q3[2], q3[0], r10, r9
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    strd r0, r1, [r11, #16]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt.w r1, #-2147483648
+; CHECK-NEXT:    vcmp.f32 s16, s28
+; CHECK-NEXT:    vmov q3[3], q3[1], r11, r8
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    mvngt r1, #-2147483648
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    vmov q0, q5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r11, #24]
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r5
+; CHECK-NEXT:    vmov q1, q6
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-; CHECK-NEXT:  @ %bb.4:
-    %x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f)
-    ret <8 x i100> %x
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI49_0:
+; CHECK-NEXT:    .long 0xdf000000 @ float -9.22337203E+18
+; CHECK-NEXT:  .LCPI49_1:
+; CHECK-NEXT:    .long 0x5effffff @ float 9.22337149E+18
+    %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f)
+    ret <8 x i64> %x
 }
 
-define arm_aapcs_vfpcc <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
-; CHECK-LABEL: test_signed_v8f16_v8i128:
+define arm_aapcs_vfpcc <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i100:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .pad #56
+; CHECK-NEXT:    sub sp, #56
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s20, s16
-; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vcvtt.f32.f16 s24, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s17
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s19
-; CHECK-NEXT:    vldr s22, .LCPI51_2
-; CHECK-NEXT:    vmov r8, s20
-; CHECK-NEXT:    vmov r9, s24
-; CHECK-NEXT:    vcvtt.f32.f16 s30, s18
-; CHECK-NEXT:    vmov r7, s26
-; CHECK-NEXT:    vmov r6, s19
+; CHECK-NEXT:    vmov r0, s24
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vldr s16, .LCPI51_3
-; CHECK-NEXT:    vmov r5, s30
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
-; CHECK-NEXT:    vcmp.f32 s28, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s26, s17
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vldr s22, .LCPI50_0
+; CHECK-NEXT:    vldr s20, .LCPI50_1
+; CHECK-NEXT:    vcmp.f32 s24, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    str r3, [r4, #124]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
 ; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
 ; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str r2, [sp, #48] @ 4-byte Spill
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    str r7, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s26, s22
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    str r2, [r4, #120]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s26, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    vcmp.f32 s26, s26
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str r2, [sp, #32] @ 4-byte Spill
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s24, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #116]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s22
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s24, s24
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s28, s28
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt r3, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #112]
-; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    vcvtb.f32.f16 s24, s17
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s19, s16
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s17
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r8, r1
+; CHECK-NEXT:    vcmp.f32 s24, s22
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    vcmp.f32 s24, s24
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    movvs.w r8, #0
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    str r3, [r4, #108]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s19
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt r3, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s16
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    movvs.w r11, #0
+; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    mov r9, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    str r2, [r4, #104]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    mvnlt r9, #7
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s19
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r9, #7
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s16
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    str.w r2, [r10, #83]
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #100]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s19, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s19, s19
+; CHECK-NEXT:    str.w r1, [r10, #79]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #96]
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    vmov r6, s18
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s19
+; CHECK-NEXT:    str.w r0, [r10, #75]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    str.w r4, [r10, #58]
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str.w r11, [r10, #54]
+; CHECK-NEXT:    str.w r7, [r10, #50]
+; CHECK-NEXT:    str.w r6, [r10, #33]
+; CHECK-NEXT:    str.w r8, [r10, #29]
+; CHECK-NEXT:    str.w r5, [r10, #25]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs.w r9, #0
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s30, s16
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    mvnlt r4, #7
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    mvngt r3, #-2147483648
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movgt r4, #7
+; CHECK-NEXT:    vcmp.f32 s18, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s16
-; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    itttt vs
+; CHECK-NEXT:    movvs r4, #0
+; CHECK-NEXT:    movvs r6, #0
+; CHECK-NEXT:    movvs r7, #0
+; CHECK-NEXT:    movvs r5, #0
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s16, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    str r3, [r4, #92]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    mvnlt r3, #7
+; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt r3, #7
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    vcmp.f32 s16, s16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    str r2, [r4, #88]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s16
+; CHECK-NEXT:    str.w r2, [r10, #8]
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #84]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s30, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s30, s30
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str.w r1, [r10, #4]
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #80]
-; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    str.w r0, [r10]
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    lsrl r0, r7, #28
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    orr.w r1, r7, r6, lsl #4
+; CHECK-NEXT:    str.w r1, [r10, #95]
+; CHECK-NEXT:    and r1, r4, #15
+; CHECK-NEXT:    str.w r0, [r10, #91]
+; CHECK-NEXT:    and r0, r9, #15
+; CHECK-NEXT:    lsrl r6, r1, #28
+; CHECK-NEXT:    strb.w r6, [r10, #99]
+; CHECK-NEXT:    orr.w r0, r0, r5, lsl #4
+; CHECK-NEXT:    str.w r0, [r10, #87]
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    str.w r1, [r10, #70]
+; CHECK-NEXT:    str.w r0, [r10, #66]
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb.w r2, [r10, #74]
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r10, #62]
+; CHECK-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    str.w r1, [r10, #45]
+; CHECK-NEXT:    str.w r0, [r10, #41]
+; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb.w r2, [r10, #49]
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r10, #37]
+; CHECK-NEXT:    ldr r7, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #48] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    strd r0, r1, [r10, #16]
+; CHECK-NEXT:    ldr r0, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb.w r2, [r10, #24]
+; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    and r0, r3, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r10, #12]
+; CHECK-NEXT:    add sp, #56
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI50_0:
+; CHECK-NEXT:    .long 0xf1000000 @ float -6.338253E+29
+; CHECK-NEXT:  .LCPI50_1:
+; CHECK-NEXT:    .long 0x70ffffff @ float 6.33825262E+29
+    %x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f)
+    ret <8 x i100> %x
+}
+
+define arm_aapcs_vfpcc <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
+; CHECK-LABEL: test_signed_v8f16_v8i128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vcvtt.f32.f16 s26, s19
+; CHECK-NEXT:    vcvtb.f32.f16 s28, s19
+; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vcvtb.f32.f16 s24, s17
+; CHECK-NEXT:    vldr s20, .LCPI51_0
 ; CHECK-NEXT:    vmov r5, s28
+; CHECK-NEXT:    vmov r8, s24
+; CHECK-NEXT:    vcvtt.f32.f16 s30, s18
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s18, s16
+; CHECK-NEXT:    vldr s22, .LCPI51_1
+; CHECK-NEXT:    add.w r12, r4, #112
+; CHECK-NEXT:    vmov r6, s30
+; CHECK-NEXT:    vcmp.f32 s26, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s26, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s26, s26
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    str r3, [r4, #76]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s16
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    str r2, [r4, #72]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s16
-; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #68]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s18, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s18, s18
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #64]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    vcvtb.f32.f16 s26, s18
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s28, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    add.w r12, r4, #96
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    str r3, [r4, #60]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
 ; CHECK-NEXT:    movvs r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s22
-; CHECK-NEXT:    str r2, [r4, #56]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s28
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s16
-; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    vmov r7, s26
+; CHECK-NEXT:    vcvtt.f32.f16 s28, s17
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s30, s22
+; CHECK-NEXT:    add.w r12, r4, #80
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #52]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s30, s30
+; CHECK-NEXT:    itttt gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s28, s28
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #48]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    vmov r5, s28
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s16
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s26, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    add.w r12, r4, #64
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s26, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    str r3, [r4, #44]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    str r2, [r4, #40]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s16
-; CHECK-NEXT:    b.w .LBB51_3
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI51_2:
-; CHECK-NEXT:    .long 0x7effffff @ float 1.70141173E+38
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:  .LCPI51_3:
-; CHECK-NEXT:    .long 0xff000000 @ float -1.70141183E+38
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  .LBB51_3:
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    itt vs
 ; CHECK-NEXT:    movvs r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #36]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s26, s26
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #32]
-; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    vmov r6, s18
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s24, s16
+; CHECK-NEXT:    vcmp.f32 s28, s22
+; CHECK-NEXT:    add.w r12, r4, #48
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s28, s28
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s22
-; CHECK-NEXT:    str r3, [r4, #28]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s16
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    itt vs
+; CHECK-NEXT:    movvs r1, #0
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    vcmp.f32 s24, s22
-; CHECK-NEXT:    str r2, [r4, #24]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    add.w r12, r4, #32
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcmp.f32 s24, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s16
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #20]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s24, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s24, s24
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4, #16]
-; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    vcmp.f32 s20, s16
+; CHECK-NEXT:    vcmp.f32 s18, s22
+; CHECK-NEXT:    add.w r12, r4, #16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s18, s18
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s16
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
 ; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    str r3, [r4, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s16
 ; CHECK-NEXT:    it vs
-; CHECK-NEXT:    movvs r2, #0
+; CHECK-NEXT:    movvs r0, #0
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    vcmp.f32 s16, s22
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    str r2, [r4, #8]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r3, #-2147483648
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vcmp.f32 s16, s16
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    mvngt r3, #-2147483648
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s16
-; CHECK-NEXT:    it vs
+; CHECK-NEXT:    ittt vs
+; CHECK-NEXT:    movvs r3, #0
+; CHECK-NEXT:    movvs r2, #0
 ; CHECK-NEXT:    movvs r1, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #4]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s20, s22
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s20, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it vs
 ; CHECK-NEXT:    movvs r0, #0
-; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    stm r4!, {r0, r1, r2, r3}
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
-; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI51_0:
+; CHECK-NEXT:    .long 0x7effffff @ float 1.70141173E+38
+; CHECK-NEXT:  .LCPI51_1:
+; CHECK-NEXT:    .long 0xff000000 @ float -1.70141183E+38
     %x = call <8 x i128> @llvm.fptosi.sat.v8f16.v8i128(<8 x half> %f)
     ret <8 x i128> %x
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 5ab184a066e49..13609bd1903f2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -39,40 +39,28 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f32_v2i32(<2 x float> %f) {
 ; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    vldr s18, .LCPI1_0
 ; CHECK-NEXT:    vcmp.f32 s17, #0
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s18
-; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt r4, #0
 ; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s18
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s18
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s18
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -1059,38 +1047,26 @@ define arm_aapcs_vfpcc <2 x i32> @test_unsigned_v2f16_v2i32(<2 x half> %f) {
 ; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt r4, #0
 ; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt r1, #0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -1504,112 +1480,91 @@ define arm_aapcs_vfpcc <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) {
 ; CHECK-NEXT:    .vsave {d8, d9, d10}
 ; CHECK-NEXT:    vpush {d8, d9, d10}
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r9, r0
 ; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    vldr s20, .LCPI28_0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vldr s20, .LCPI28_0
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r10, r0
-; CHECK-NEXT:    mov r0, r4
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r5, #65535
-; CHECK-NEXT:    movtgt r5, #3
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vcmp.f32 s19, #0
 ; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r8, #0
 ; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    vmov r5, s16
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movwgt r7, #65535
 ; CHECK-NEXT:    movtgt r7, #3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    mov r10, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    str.w r6, [r8]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    mov r1, r7
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movwgt r10, #65535
+; CHECK-NEXT:    movtgt r10, #3
 ; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    mov r1, r10
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    bfc r5, #18, #14
-; CHECK-NEXT:    mov r6, r10
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    str.w r0, [r9]
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    bfc r7, #18, #14
+; CHECK-NEXT:    mov r6, r8
 ; CHECK-NEXT:    lsll r4, r1, #22
-; CHECK-NEXT:    lsrl r6, r5, #28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    orrs r1, r5
+; CHECK-NEXT:    lsrl r6, r7, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r9, #65535
-; CHECK-NEXT:    movtgt r9, #3
-; CHECK-NEXT:    str.w r1, [r8, #20]
+; CHECK-NEXT:    movwgt r5, #65535
+; CHECK-NEXT:    movtgt r5, #3
+; CHECK-NEXT:    orrs r1, r7
+; CHECK-NEXT:    str.w r1, [r9, #20]
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s17, #0
 ; CHECK-NEXT:    orr.w r2, r6, r4
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    bfc r9, #18, #14
+; CHECK-NEXT:    bfc r5, #18, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movwgt r1, #65535
 ; CHECK-NEXT:    movtgt r1, #3
-; CHECK-NEXT:    str.w r2, [r8, #16]
-; CHECK-NEXT:    lsrs r2, r7, #10
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    strb.w r2, [r8, #24]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    str.w r2, [r9, #16]
+; CHECK-NEXT:    lsr.w r2, r10, #10
+; CHECK-NEXT:    strb.w r2, [r9, #24]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    orr.w r0, r9, r0, lsl #18
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    orr.w r0, r5, r0, lsl #18
 ; CHECK-NEXT:    lsrl r2, r1, #14
-; CHECK-NEXT:    orr.w r1, r1, r10, lsl #4
-; CHECK-NEXT:    strd r2, r1, [r8, #8]
-; CHECK-NEXT:    str.w r0, [r8, #4]
+; CHECK-NEXT:    orr.w r1, r1, r8, lsl #4
+; CHECK-NEXT:    strd r2, r1, [r9, #8]
+; CHECK-NEXT:    str.w r0, [r9, #4]
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 ; CHECK-NEXT:    .p2align 2
@@ -1636,85 +1591,61 @@ define arm_aapcs_vfpcc <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vldr s20, .LCPI29_0
 ; CHECK-NEXT:    vcmp.f32 s19, #0
+; CHECK-NEXT:    mov r10, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    mov r10, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vmov r9, s17
 ; CHECK-NEXT:    vmov r8, s16
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r10, #-1
 ; CHECK-NEXT:    movgt.w r11, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    mov r0, r9
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r7, #-1
 ; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    vcmp.f32 s17, #0
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s17, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r4, #-1
 ; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmov q1[2], q1[0], r7, r11
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
+; CHECK-NEXT:    vmov q1[3], q1[1], r6, r10
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r5
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r6, r10
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1735,179 +1666,125 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10}
 ; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vmov r0, s17
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov r9, r3
-; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vldr s20, .LCPI30_0
+; CHECK-NEXT:    vcmp.f32 s17, #0
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s17, s20
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
+; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r10, #0
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str.w r2, [r8, #33]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
+; CHECK-NEXT:    str.w r2, [r4, #33]
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #29]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    str.w r1, [r4, #29]
+; CHECK-NEXT:    vmov r1, s19
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #25]
-; CHECK-NEXT:    vmov r7, s17
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    mov r0, r3
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    mov r10, r3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    str.w r0, [r4, #25]
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt.w r10, #15
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vcmp.f32 s19, #0
+; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    mov r8, r2
+; CHECK-NEXT:    mov r11, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str.w r2, [r8, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r9, #0
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r11, #15
+; CHECK-NEXT:    movgt.w r8, #-1
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r9, #-1
+; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #4]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8]
-; CHECK-NEXT:    mov r0, r4
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r9, #15
-; CHECK-NEXT:    and r0, r9, #15
-; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    orr.w r0, r0, r4, lsl #4
-; CHECK-NEXT:    str.w r0, [r8, #37]
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    mov r11, r3
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #15
-; CHECK-NEXT:    and r7, r10, #15
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
-; CHECK-NEXT:    str.w r7, [r8, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r4, r5, #28
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    str.w r7, [r8, #45]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r4, [r8, #41]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #15
-; CHECK-NEXT:    and r5, r11, #15
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    lsrl r6, r5, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r6, [r8, #49]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    str r2, [r4, #8]
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    lsrl r0, r1, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str r1, [r4, #4]
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    strd r0, r1, [r8, #16]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    lsrl r0, r9, #28
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    orr.w r1, r9, r8, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #45]
+; CHECK-NEXT:    and r1, r11, #15
+; CHECK-NEXT:    str.w r0, [r4, #41]
+; CHECK-NEXT:    and r0, r10, #15
+; CHECK-NEXT:    lsrl r8, r1, #28
+; CHECK-NEXT:    strb.w r8, [r4, #49]
+; CHECK-NEXT:    orr.w r0, r0, r5, lsl #4
+; CHECK-NEXT:    str.w r0, [r4, #37]
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    lsrl r0, r7, #28
+; CHECK-NEXT:    orr.w r1, r7, r6, lsl #4
+; CHECK-NEXT:    strd r0, r1, [r4, #16]
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r6, r1, #28
+; CHECK-NEXT:    strb r6, [r4, #24]
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt r3, #15
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r8, #24]
+; CHECK-NEXT:    and r0, r3, #15
+; CHECK-NEXT:    orr.w r0, r0, r2, lsl #4
+; CHECK-NEXT:    str r0, [r4, #12]
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1932,160 +1809,87 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    vmov r0, s19
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vmov r5, s18
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vldr s20, .LCPI31_0
 ; CHECK-NEXT:    vcmp.f32 s19, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s19, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r3, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    str r3, [r4, #60]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    str r2, [r4, #56]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s19, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #52]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s19, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #48]
-; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    strd r5, r1, [r4, #48]
 ; CHECK-NEXT:    vmov r6, s17
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    vmov r7, s16
+; CHECK-NEXT:    strd r2, r3, [r4, #56]
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
+; CHECK-NEXT:    add.w r12, r4, #32
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r3, [r4, #44]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r2, [r4, #40]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #36]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #32]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s17, #0
+; CHECK-NEXT:    add.w r12, r4, #16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    str r3, [r4, #28]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    str r2, [r4, #24]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #20]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s17, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #16]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r3, [r4, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r2, [r4, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #4]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    stm r4!, {r0, r1, r2, r3}
 ; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
@@ -3762,232 +3566,198 @@ define arm_aapcs_vfpcc <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) {
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vcvtb.f32.f16 s22, s17
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcvtb.f32.f16 s24, s18
+; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vcvtt.f32.f16 s20, s18
+; CHECK-NEXT:    vldr s18, .LCPI48_0
+; CHECK-NEXT:    vcmp.f32 s22, #0
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    vcvtt.f32.f16 s26, s17
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    vcmp.f32 s22, s18
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movwgt r1, #65535
+; CHECK-NEXT:    movtgt r1, #3
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    vmov r5, s26
+; CHECK-NEXT:    str r2, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    vmov r6, s20
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s19
 ; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vcvtb.f32.f16 s22, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s19
 ; CHECK-NEXT:    vcmp.f32 s24, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    vmov r5, s22
-; CHECK-NEXT:    vldr s20, .LCPI48_0
-; CHECK-NEXT:    vmov r11, s26
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    vcmp.f32 s24, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r4, #65535
-; CHECK-NEXT:    movtgt r4, #3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movwgt r1, #65535
+; CHECK-NEXT:    movtgt r1, #3
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    str.w r7, [r10, #25]
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vcmp.f32 s26, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s26, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movwgt r1, #65535
+; CHECK-NEXT:    movtgt r1, #3
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    str.w r7, [r4, #25]
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    mov r7, r4
-; CHECK-NEXT:    str.w r0, [r10]
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    vcmp.f32 s20, #0
+; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s20, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s20, s19
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    mov r0, r11
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    bfc r7, #18, #14
-; CHECK-NEXT:    lsll r6, r7, #22
+; CHECK-NEXT:    movwgt r7, #65535
+; CHECK-NEXT:    movtgt r7, #3
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vcmp.f32 s20, #0
+; CHECK-NEXT:    mov r9, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    vcmp.f32 s20, s18
+; CHECK-NEXT:    vcvtt.f32.f16 s20, s19
+; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt.w r9, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r9, #-1
+; CHECK-NEXT:    movwgt r11, #65535
+; CHECK-NEXT:    movtgt r11, #3
+; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    vcmp.f32 s20, #0
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r1, #65535
-; CHECK-NEXT:    movtgt r1, #3
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    vcvtt.f32.f16 s26, s18
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    orr.w r0, r1, r7
-; CHECK-NEXT:    str.w r0, [r10, #45]
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    orrs r6, r2
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s17
-; CHECK-NEXT:    lsrs r0, r4, #10
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r1, #65535
-; CHECK-NEXT:    movtgt r1, #3
-; CHECK-NEXT:    str.w r6, [r10, #41]
-; CHECK-NEXT:    strb.w r0, [r10, #49]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    bfc r1, #18, #14
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    lsrl r4, r1, #14
-; CHECK-NEXT:    orr.w r6, r1, r5, lsl #4
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s26, s17
-; CHECK-NEXT:    mov r11, r0
-; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vcmp.f32 s20, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s20, s16
 ; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r10, #-1
 ; CHECK-NEXT:    movwgt r5, #65535
 ; CHECK-NEXT:    movtgt r5, #3
-; CHECK-NEXT:    str.w r6, [r10, #37]
-; CHECK-NEXT:    str.w r4, [r10, #33]
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    mov r6, r1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r6, #65535
-; CHECK-NEXT:    movtgt r6, #3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r9, #65535
-; CHECK-NEXT:    movtgt r9, #3
-; CHECK-NEXT:    bfc r9, #18, #14
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    orr.w r0, r9, r7, lsl #18
-; CHECK-NEXT:    str.w r0, [r10, #29]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    mov r1, r6
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    vcmp.f32 s20, #0
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    vcmp.f32 s20, s18
+; CHECK-NEXT:    bfc r11, #18, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    lsrs r0, r5, #10
 ; CHECK-NEXT:    bfc r5, #18, #14
+; CHECK-NEXT:    lsll r10, r5, #22
+; CHECK-NEXT:    lsrl r2, r11, #28
+; CHECK-NEXT:    orr.w r1, r11, r5
+; CHECK-NEXT:    str.w r1, [r4, #45]
+; CHECK-NEXT:    orr.w r1, r2, r10
+; CHECK-NEXT:    str.w r1, [r4, #41]
+; CHECK-NEXT:    strb.w r0, [r4, #49]
+; CHECK-NEXT:    bfc r7, #18, #14
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
+; CHECK-NEXT:    lsrl r0, r7, #14
+; CHECK-NEXT:    mov r5, r4
+; CHECK-NEXT:    orr.w r1, r7, r9, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #37]
+; CHECK-NEXT:    str.w r0, [r4, #33]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
-; CHECK-NEXT:    mov r8, r11
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    bfc r0, #18, #14
+; CHECK-NEXT:    orr.w r0, r0, r6, lsl #18
+; CHECK-NEXT:    str.w r0, [r4, #29]
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r3, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr.w r9, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    bfc r3, #18, #14
+; CHECK-NEXT:    mov r6, r9
 ; CHECK-NEXT:    lsll r4, r1, #22
-; CHECK-NEXT:    lsrl r8, r5, #28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    lsrl r6, r3, #28
 ; CHECK-NEXT:    itt gt
-; CHECK-NEXT:    movwgt r7, #65535
-; CHECK-NEXT:    movtgt r7, #3
-; CHECK-NEXT:    orrs r1, r5
-; CHECK-NEXT:    str.w r1, [r10, #20]
+; CHECK-NEXT:    movwgt r8, #65535
+; CHECK-NEXT:    movtgt r8, #3
+; CHECK-NEXT:    orrs r1, r3
+; CHECK-NEXT:    str r1, [r5, #20]
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    orr.w r2, r8, r4
+; CHECK-NEXT:    orr.w r2, r6, r4
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    bfc r7, #18, #14
+; CHECK-NEXT:    vcmp.f32 s16, s18
+; CHECK-NEXT:    bfc r8, #18, #14
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movwgt r1, #65535
 ; CHECK-NEXT:    movtgt r1, #3
-; CHECK-NEXT:    str.w r2, [r10, #16]
-; CHECK-NEXT:    lsrs r2, r6, #10
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    strb.w r2, [r10, #24]
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    str r2, [r5, #16]
+; CHECK-NEXT:    lsrs r2, r7, #10
+; CHECK-NEXT:    strb r2, [r5, #24]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    orr.w r0, r7, r0, lsl #18
+; CHECK-NEXT:    bfc r1, #18, #14
+; CHECK-NEXT:    orr.w r0, r8, r0, lsl #18
 ; CHECK-NEXT:    lsrl r2, r1, #14
-; CHECK-NEXT:    orr.w r1, r1, r11, lsl #4
-; CHECK-NEXT:    strd r2, r1, [r10, #8]
-; CHECK-NEXT:    str.w r0, [r10, #4]
-; CHECK-NEXT:    add sp, #8
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    orr.w r1, r1, r9, lsl #4
+; CHECK-NEXT:    strd r2, r1, [r5, #8]
+; CHECK-NEXT:    str r0, [r5, #4]
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 2
@@ -4005,8 +3775,8 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vcvtt.f32.f16 s20, s19
 ; CHECK-NEXT:    vmov r0, s20
@@ -4016,178 +3786,128 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-NEXT:    vmov r0, s22
 ; CHECK-NEXT:    vldr s28, .LCPI49_0
 ; CHECK-NEXT:    vcmp.f32 s20, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s24, s16
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s18
+; CHECK-NEXT:    vcvtt.f32.f16 s26, s16
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt.w r9, #0
+; CHECK-NEXT:    movlt.w r8, #0
 ; CHECK-NEXT:    vcmp.f32 s20, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    vmov r4, s24
+; CHECK-NEXT:    vmov r6, s26
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r8, #-1
 ; CHECK-NEXT:    movgt.w r9, #-1
-; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    mov r10, r0
 ; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    mov r11, r0
+; CHECK-NEXT:    mov r11, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s28
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s20, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r8, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r10, r1
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r8, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    vcmp.f32 s22, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r10, #-1
+; CHECK-NEXT:    movgt.w r11, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    mov r4, r1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r0, r4
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r4, #0
 ; CHECK-NEXT:    vcmp.f32 s24, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s30, s17
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s30
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    movgt.w r5, #-1
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s30, #0
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    vcmp.f32 s26, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s28
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    vcmp.f32 s26, s28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s17
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:    vmov q5[3], q5[1], r7, r5
 ; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcvtt.f32.f16 s17, s18
-; CHECK-NEXT:    mov r7, r1
-; CHECK-NEXT:    vmov r1, s17
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s28
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    bl __aeabi_f2ulz
-; CHECK-NEXT:    vcmp.f32 s17, #0
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s16, s17
+; CHECK-NEXT:    vmov q5[2], q5[0], r0, r7
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    vmov q5[3], q5[1], r1, r6
+; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
+; CHECK-NEXT:    mov r7, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s18
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s17
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r6, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r6, #-1
 ; CHECK-NEXT:    movgt.w r7, #-1
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    vmov q6[3], q6[1], r7, r4
 ; CHECK-NEXT:    bl __aeabi_f2ulz
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    vmov q3[2], q3[0], r11, r9
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s17, s28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s18
+; CHECK-NEXT:    vmov q6[2], q6[0], r0, r7
+; CHECK-NEXT:    vmov r0, s16
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    vmov q6[3], q6[1], r1, r6
+; CHECK-NEXT:    bl __aeabi_f2ulz
+; CHECK-NEXT:    vcmp.f32 s16, #0
+; CHECK-NEXT:    vmov q3[2], q3[0], r10, r9
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s28
-; CHECK-NEXT:    vmov q2[2], q2[0], r0, r6
+; CHECK-NEXT:    vmov q3[3], q3[1], r11, r8
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
+; CHECK-NEXT:    movgt.w r0, #-1
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r5
-; CHECK-NEXT:    vmov q3[3], q3[1], r10, r8
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r5
 ; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r4
 ; CHECK-NEXT:    vmov q1, q6
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 2
@@ -4205,385 +3925,272 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    .pad #56
+; CHECK-NEXT:    sub sp, #56
 ; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    vcvtb.f32.f16 s30, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s28, s18
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vcvtt.f32.f16 s22, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s24, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s26, s17
-; CHECK-NEXT:    vldr s20, .LCPI50_1
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov r7, s28
-; CHECK-NEXT:    vcvtt.f32.f16 s18, s18
-; CHECK-NEXT:    vmov r9, s24
-; CHECK-NEXT:    vmov r6, s26
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    vcvtt.f32.f16 s22, s16
+; CHECK-NEXT:    vmov r0, s22
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s30, #0
-; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s17
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    vmov r0, s24
+; CHECK-NEXT:    vldr s20, .LCPI50_0
+; CHECK-NEXT:    vcmp.f32 s22, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str.w r2, [r8, #83]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
+; CHECK-NEXT:    str r2, [sp, #48] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #79]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #75]
-; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    str r7, [sp, #40] @ 4-byte Spill
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s22, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str.w r2, [r8, #58]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #54]
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #50]
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    str r3, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s24, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str.w r2, [r8, #33]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp, #36] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #29]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    str r2, [sp, #32] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8, #25]
-; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    vmov r0, s22
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    vcmp.f32 s22, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
 ; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    str.w r2, [r8, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r1, [r8, #4]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str.w r0, [r8]
-; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    vcvtb.f32.f16 s22, s17
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vmov r0, s22
 ; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
+; CHECK-NEXT:    mov r9, r0
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt.w r8, #0
+; CHECK-NEXT:    movlt.w r9, #0
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    vcmp.f32 s22, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
 ; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r9, #-1
+; CHECK-NEXT:    movgt.w r8, #-1
 ; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r5, #15
-; CHECK-NEXT:    and r0, r5, #15
-; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    orr.w r1, r0, r6, lsl #4
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    str.w r1, [r8, #87]
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    mov r10, r0
+; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r10, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r10, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcvtb.f32.f16 s18, s19
+; CHECK-NEXT:    mov r11, r1
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    mov r7, r2
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r7, #0
+; CHECK-NEXT:    movlt.w r11, #0
+; CHECK-NEXT:    movlt r5, #0
+; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r0, r7, #15
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s17
-; CHECK-NEXT:    orr.w r0, r0, r10, lsl #4
-; CHECK-NEXT:    str.w r0, [r8, #62]
-; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    movgt r3, #15
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r11, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    vcmp.f32 s18, #0
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s18, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt.w r10, #0
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    mov r11, r1
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    str.w r2, [r4, #83]
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, #15
-; CHECK-NEXT:    and r0, r0, #15
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    orr.w r0, r0, r1, lsl #4
-; CHECK-NEXT:    str.w r0, [r8, #37]
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    str.w r1, [r4, #79]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r7, r7, #15
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    orr.w r7, r7, r0, lsl #4
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str.w r7, [r8, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r9, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r9, #-1
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    lsrl r6, r9, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r4, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r4, #-1
-; CHECK-NEXT:    orr.w r7, r9, r4, lsl #4
-; CHECK-NEXT:    str.w r7, [r8, #95]
-; CHECK-NEXT:    str.w r6, [r8, #91]
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s19
+; CHECK-NEXT:    str.w r0, [r4, #75]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    str.w r7, [r4, #58]
+; CHECK-NEXT:    str.w r11, [r4, #54]
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    str.w r5, [r4, #50]
+; CHECK-NEXT:    str.w r6, [r4, #33]
+; CHECK-NEXT:    str.w r8, [r4, #29]
+; CHECK-NEXT:    str.w r9, [r4, #25]
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r7, r7, #15
+; CHECK-NEXT:    movgt.w r10, #15
+; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    mov r7, r1
 ; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    lsrl r4, r7, #28
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    mov r8, r3
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r4, [r8, #99]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r7, #0
 ; CHECK-NEXT:    movlt r5, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r5, #-1
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r10, r5, #28
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt.w r8, #0
 ; CHECK-NEXT:    vcmp.f32 s18, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    orr.w r7, r5, r6, lsl #4
-; CHECK-NEXT:    str.w r7, [r8, #70]
-; CHECK-NEXT:    str.w r10, [r8, #66]
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    ldr r7, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r5, r7, #15
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    lsrl r6, r5, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r6, [r8, #74]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt.w r11, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r11, #-1
-; CHECK-NEXT:    ldr r4, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    lsrl r4, r11, #28
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r6, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    b.w .LBB50_2
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI50_1:
-; CHECK-NEXT:    .long 0x717fffff @ float 1.26765052E+30
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itttt gt
+; CHECK-NEXT:    movgt.w r8, #15
 ; CHECK-NEXT:    movgt.w r6, #-1
-; CHECK-NEXT:    orr.w r7, r11, r6, lsl #4
-; CHECK-NEXT:    str.w r7, [r8, #45]
-; CHECK-NEXT:    str.w r4, [r8, #41]
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    ldr r7, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r7, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r7, #15
-; CHECK-NEXT:    and r5, r7, #15
+; CHECK-NEXT:    movgt.w r5, #-1
+; CHECK-NEXT:    movgt.w r7, #-1
+; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    lsrl r6, r5, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    strb.w r6, [r8, #49]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    movlt r0, #0
 ; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    lsrl r0, r1, #28
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    vcmp.f32 s16, s20
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    strd r0, r1, [r8, #16]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #0
+; CHECK-NEXT:    str r2, [r4, #8]
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    str r1, [r4, #4]
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    str r0, [r4]
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    lsrl r0, r7, #28
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    orr.w r1, r7, r6, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #95]
+; CHECK-NEXT:    and r1, r8, #15
+; CHECK-NEXT:    str.w r0, [r4, #91]
+; CHECK-NEXT:    and r0, r10, #15
+; CHECK-NEXT:    lsrl r6, r1, #28
+; CHECK-NEXT:    strb.w r6, [r4, #99]
+; CHECK-NEXT:    orr.w r0, r0, r5, lsl #4
+; CHECK-NEXT:    str.w r0, [r4, #87]
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #70]
+; CHECK-NEXT:    str.w r0, [r4, #66]
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb.w r2, [r4, #74]
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r4, #62]
+; CHECK-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    str.w r1, [r4, #45]
+; CHECK-NEXT:    str.w r0, [r4, #41]
+; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb.w r2, [r4, #49]
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    and r0, r0, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str.w r0, [r4, #37]
+; CHECK-NEXT:    ldr r7, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #48] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    lsrl r0, r1, #28
+; CHECK-NEXT:    orr.w r1, r1, r2, lsl #4
+; CHECK-NEXT:    strd r0, r1, [r4, #16]
+; CHECK-NEXT:    ldr r0, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    and r1, r0, #15
+; CHECK-NEXT:    lsrl r2, r1, #28
+; CHECK-NEXT:    strb r2, [r4, #24]
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt r3, #15
-; CHECK-NEXT:    and r1, r3, #15
-; CHECK-NEXT:    lsrl r2, r1, #28
-; CHECK-NEXT:    strb.w r2, [r8, #24]
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    and r0, r3, #15
+; CHECK-NEXT:    orr.w r0, r0, r7, lsl #4
+; CHECK-NEXT:    str r0, [r4, #12]
+; CHECK-NEXT:    add sp, #56
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI50_0:
+; CHECK-NEXT:    .long 0x717fffff @ float 1.26765052E+30
     %x = call <8 x i100> @llvm.fptoui.sat.v8f16.v8i100(<8 x half> %f)
     ret <8 x i100> %x
 }
@@ -4591,337 +4198,195 @@ define arm_aapcs_vfpcc <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 define arm_aapcs_vfpcc <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-LABEL: test_unsigned_v8f16_v8i128:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    .pad #4
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    vcvtt.f32.f16 s26, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s22, s16
-; CHECK-NEXT:    vmov r0, s26
-; CHECK-NEXT:    vcvtt.f32.f16 s16, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s24, s17
-; CHECK-NEXT:    vcvtb.f32.f16 s30, s19
-; CHECK-NEXT:    vldr s20, .LCPI51_0
-; CHECK-NEXT:    vmov r8, s22
-; CHECK-NEXT:    vmov r9, s16
-; CHECK-NEXT:    vcvtt.f32.f16 s28, s18
-; CHECK-NEXT:    vmov r7, s24
-; CHECK-NEXT:    vmov r6, s30
+; CHECK-NEXT:    vcvtt.f32.f16 s22, s19
+; CHECK-NEXT:    vmov r0, s22
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s28, s19
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    vmov r0, s28
+; CHECK-NEXT:    vldr s20, .LCPI51_0
+; CHECK-NEXT:    vcmp.f32 s22, #0
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s18
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    vcvtb.f32.f16 s26, s18
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r6, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r3, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str r3, [r4, #124]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str r2, [r4, #120]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #116]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #112]
-; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    vmov r5, s28
+; CHECK-NEXT:    movgt.w r6, #-1
+; CHECK-NEXT:    strd r6, r1, [r4, #112]
+; CHECK-NEXT:    vmov r7, s24
+; CHECK-NEXT:    vmov r5, s26
+; CHECK-NEXT:    vcvtt.f32.f16 s18, s17
+; CHECK-NEXT:    strd r2, r3, [r4, #120]
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s30, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s26, s17
+; CHECK-NEXT:    vcmp.f32 s28, #0
+; CHECK-NEXT:    add.w r12, r4, #96
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s28, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str r3, [r4, #108]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, s20
-; CHECK-NEXT:    str r2, [r4, #104]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s30, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r1, #-1
+; CHECK-NEXT:    movgt.w r0, #-1
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    vmov r6, s18
+; CHECK-NEXT:    vcvtb.f32.f16 s22, s17
+; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    add.w r12, r4, #80
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #100]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s24, s20
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s30, s20
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
+; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt.w r3, #-1
+; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NEXT:    ittt gt
+; CHECK-NEXT:    movgt.w r2, #-1
+; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #96]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    vmov r6, s18
+; CHECK-NEXT:    vcvtt.f32.f16 s24, s16
+; CHECK-NEXT:    vmov r7, s22
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s28, #0
+; CHECK-NEXT:    vcmp.f32 s26, #0
+; CHECK-NEXT:    add.w r12, r4, #64
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s26, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str r3, [r4, #92]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    str r2, [r4, #88]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s28, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #84]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s28, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #80]
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
 ; CHECK-NEXT:    mov r0, r6
-; CHECK-NEXT:    vmov r5, s26
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vcvtb.f32.f16 s16, s16
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s18, #0
+; CHECK-NEXT:    add.w r12, r4, #48
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r3, [r4, #76]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    str r2, [r4, #72]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s18, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    itt gt
 ; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #68]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s18, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #64]
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    vmov r6, s16
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s26, #0
+; CHECK-NEXT:    vcmp.f32 s22, #0
+; CHECK-NEXT:    add.w r12, r4, #32
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    vcmp.f32 s22, s20
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str r3, [r4, #60]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    str r2, [r4, #56]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s26, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #52]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s26, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #48]
-; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s24, #0
+; CHECK-NEXT:    add.w r12, r4, #16
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
+; CHECK-NEXT:    movlt r0, #0
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    str r3, [r4, #44]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    str r2, [r4, #40]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s24, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #36]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s24, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #32]
-; CHECK-NEXT:    mov r0, r9
+; CHECK-NEXT:    stm.w r12, {r0, r1, r2, r3}
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    vcmp.f32 s16, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r3, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r3, [r4, #28]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    str r2, [r4, #24]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s16, #0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r1, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #20]
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itttt lt
 ; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s16, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4, #16]
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r1, #0
+; CHECK-NEXT:    movlt r2, #0
 ; CHECK-NEXT:    movlt r3, #0
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    it gt
+; CHECK-NEXT:    ittt gt
 ; CHECK-NEXT:    movgt.w r3, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r3, [r4, #12]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r2, #-1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    str r2, [r4, #8]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r1, #0
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f32 s22, #0
-; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r1, #-1
 ; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    str r1, [r4, #4]
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, #0
-; CHECK-NEXT:    vcmp.f32 s22, s20
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    movgt.w r0, #-1
-; CHECK-NEXT:    str r0, [r4]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    stm r4!, {r0, r1, r2, r3}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI51_0:
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index 117469f3bd788..101b49fea488a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -300,27 +300,27 @@ define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8f32(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vcmp.f16 s10, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s10, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s1
 ; CHECK-MVE-NEXT:    csetm r12, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s10, s8
 ; CHECK-MVE-NEXT:    csetm lr, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f16 s10, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    csetm r2, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
 ; CHECK-MVE-NEXT:    csetm r3, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f16 s6, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    csetm r0, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s2
 ; CHECK-MVE-NEXT:    csetm r1, ne
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
@@ -329,12 +329,12 @@ define arm_aapcs_vfpcc <8 x i16> @zext_v8i1_v8f32(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    csetm r5, ne
 ; CHECK-MVE-NEXT:    vmov.16 q1[0], r5
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r4
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r3
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
-; CHECK-MVE-NEXT:    vmov.16 q1[5], lr
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r2
+; CHECK-MVE-NEXT:    vmov.16 q1[1], r4
+; CHECK-MVE-NEXT:    vmov.16 q1[2], r1
+; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
+; CHECK-MVE-NEXT:    vmov.16 q1[4], r3
+; CHECK-MVE-NEXT:    vmov.16 q1[5], r2
+; CHECK-MVE-NEXT:    vmov.16 q1[6], lr
 ; CHECK-MVE-NEXT:    vmov.16 q1[7], r12
 ; CHECK-MVE-NEXT:    vand q0, q1, q0
 ; CHECK-MVE-NEXT:    pop {r4, r5, r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
index baf0076277e50..6f2539e3cad9a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
@@ -43,10 +43,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_one_v4f32(<4 x float> %src, <4 x float>
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, s5
-; CHECK-MVE-NEXT:    cset r0, mi
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, le
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s7
@@ -230,10 +228,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ueq_v4f32(<4 x float> %src, <4 x float>
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, s5
-; CHECK-MVE-NEXT:    cset r0, eq
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    cset r0, eq
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, vc
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s7
@@ -271,17 +267,17 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, <4 x float> %src2, <4 x float> %a, <4 x float> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s3, s7
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s2, s6
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s5
 ; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    vcmp.f32 s3, s7
 ; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -1090,53 +1086,53 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    .vsave {d8, d9}
+; CHECK-MVE-NEXT:    vpush {d8, d9}
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s0
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s22, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
-; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s13
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s16, s12
+; CHECK-MVE-NEXT:    vins.f16 s0, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
 ; CHECK-MVE-NEXT:    vins.f16 s1, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s8, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s5, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s15
 ; CHECK-MVE-NEXT:    vins.f16 s2, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s15
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s10, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
 ; CHECK-MVE-NEXT:    vins.f16 s3, s4
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_une_v8f16:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
index fe82255bff6c8..d42c393743f4f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -46,10 +46,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_one_v4f32(<4 x float> %src, float %src2
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, s4
-; CHECK-MVE-NEXT:    cset r0, mi
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, le
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s4
@@ -248,10 +246,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ueq_v4f32(<4 x float> %src, float %src2
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, s4
-; CHECK-MVE-NEXT:    cset r0, eq
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    cset r0, eq
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, vc
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, s4
@@ -292,17 +288,17 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s3, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s2, s4
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, s4
 ; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    vcmp.f32 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -1105,42 +1101,42 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s7, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s7, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
 ; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s15
 ; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s10, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
 ; CHECK-MVE-NEXT:    vins.f16 s3, s6
@@ -1659,10 +1655,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_one_v4f32(<4 x float> %src, float %sr
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
-; CHECK-MVE-NEXT:    cset r0, mi
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, le
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
@@ -1861,10 +1855,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_ueq_v4f32(<4 x float> %src, float %sr
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
-; CHECK-MVE-NEXT:    cset r0, eq
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    cset r0, eq
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, vc
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
@@ -1905,17 +1897,17 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vcmp_r_une_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_une_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s2
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
 ; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -2718,42 +2710,42 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-LABEL: vcmp_r_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s7, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s7, s5
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s12, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s13
 ; CHECK-MVE-NEXT:    vins.f16 s0, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s14
 ; CHECK-MVE-NEXT:    vseleq.f16 s1, s13, s9
 ; CHECK-MVE-NEXT:    vins.f16 s1, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s15
 ; CHECK-MVE-NEXT:    vseleq.f16 s2, s14, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s15
 ; CHECK-MVE-NEXT:    vins.f16 s2, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s10, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s8, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s3, s15, s11
 ; CHECK-MVE-NEXT:    vins.f16 s3, s6
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
index 16689f1e7ecd1..718657839d38d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
@@ -43,10 +43,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_one_v4f32(<4 x float> %src, <4 x float>
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
-; CHECK-MVE-NEXT:    cset r0, mi
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, le
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
@@ -230,10 +228,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ueq_v4f32(<4 x float> %src, <4 x float>
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
-; CHECK-MVE-NEXT:    cset r0, eq
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    cset r0, eq
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, vc
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
@@ -271,17 +267,17 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vcmp_une_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) {
 ; CHECK-MVE-LABEL: vcmp_une_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s11, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vseleq.f32 s2, s10, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f32 s1, s9, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s11, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f32 s0, s8, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -1042,42 +1038,42 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-LABEL: vcmp_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s14
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
 ; CHECK-MVE-NEXT:    vins.f16 s1, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
 ; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
 ; CHECK-MVE-NEXT:    vins.f16 s2, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
 ; CHECK-MVE-NEXT:    vins.f16 s3, s4
@@ -1572,10 +1568,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_one_v4f32(<4 x float> %src, <4 x floa
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
-; CHECK-MVE-NEXT:    cset r0, mi
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    cset r0, mi
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, le
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
@@ -1759,10 +1753,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_r_ueq_v4f32(<4 x float> %src, <4 x floa
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
-; CHECK-MVE-NEXT:    cset r0, eq
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    cset r0, eq
 ; CHECK-MVE-NEXT:    csinc r0, r0, zr, vc
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
@@ -1800,17 +1792,17 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vcmp_r_une_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) {
 ; CHECK-MVE-LABEL: vcmp_r_une_v4f32:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s2, #0
-; CHECK-MVE-NEXT:    vseleq.f32 s3, s11, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f32 s1, #0
 ; CHECK-MVE-NEXT:    vseleq.f32 s2, s10, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f32 s1, s9, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s0, #0
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s11, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f32 s0, s8, s4
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -2571,42 +2563,42 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-LABEL: vcmp_r_une_v8f16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s12, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s14
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s8, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s9
 ; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s10
 ; CHECK-MVE-NEXT:    vseleq.f16 s1, s9, s5
 ; CHECK-MVE-NEXT:    vins.f16 s1, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s12, s8
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
 ; CHECK-MVE-NEXT:    vseleq.f16 s2, s10, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
 ; CHECK-MVE-NEXT:    vins.f16 s2, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s4, s3
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s4, s8, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s4, s6, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vseleq.f16 s3, s11, s7
 ; CHECK-MVE-NEXT:    vins.f16 s3, s4
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir b/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
index 301d63b7f3643..fda9f8a010f3d 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
@@ -65,7 +65,7 @@ registers:
 # AVX-NEXT:          RET 0, implicit $xmm0
 #
 # AVX512VL:          %0:vr256x = COPY $ymm1
-# AVX512VL-NEXT:     %1:vr128x = VEXTRACTF32x4Z256rri %0, 1
+# AVX512VL-NEXT:     %1:vr128x = VEXTRACTF32X4Z256rri %0, 1
 # AVX512VL-NEXT:     $xmm0 = COPY %1
 # AVX512VL-NEXT:     RET 0, implicit $xmm0
 body:             |
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir b/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
index cff8560a4ba45..3f199448e89a6 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
@@ -59,7 +59,7 @@ registers:
   - { id: 0, class: vecr }
   - { id: 1, class: vecr }
 # ALL:          %0:vr512 = COPY $zmm1
-# ALL-NEXT:     %1:vr128x = VEXTRACTF32x4Zrri %0, 1
+# ALL-NEXT:     %1:vr128x = VEXTRACTF32X4Zrri %0, 1
 # ALL-NEXT:     $xmm0 = COPY %1
 # ALL-NEXT:     RET 0, implicit $xmm0
 body:             |
@@ -111,7 +111,7 @@ registers:
   - { id: 0, class: vecr }
   - { id: 1, class: vecr }
 # ALL:          %0:vr512 = COPY $zmm1
-# ALL-NEXT:     %1:vr256x = VEXTRACTF64x4Zrri %0, 1
+# ALL-NEXT:     %1:vr256x = VEXTRACTF64X4Zrri %0, 1
 # ALL-NEXT:     $ymm0 = COPY %1
 # ALL-NEXT:     RET 0, implicit $ymm0
 body:             |
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir b/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
index f04917c747979..3368ed699a1f8 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
@@ -36,7 +36,7 @@ registers:
 #
 # AVX512VL:          %0:vr256x = COPY $ymm0
 # AVX512VL-NEXT:     %1:vr128x = COPY $xmm1
-# AVX512VL-NEXT:     %2:vr256x = VINSERTF32x4Z256rri %0, %1, 0
+# AVX512VL-NEXT:     %2:vr256x = VINSERTF32X4Z256rri %0, %1, 0
 # AVX512VL-NEXT:     $ymm0 = COPY %2
 # AVX512VL-NEXT:     RET 0, implicit $ymm0
 body:             |
@@ -98,7 +98,7 @@ registers:
 #
 # AVX512VL:          %0:vr256x = COPY $ymm0
 # AVX512VL-NEXT:     %1:vr128x = COPY $xmm1
-# AVX512VL-NEXT:     %2:vr256x = VINSERTF32x4Z256rri %0, %1, 1
+# AVX512VL-NEXT:     %2:vr256x = VINSERTF32X4Z256rri %0, %1, 1
 # AVX512VL-NEXT:     $ymm0 = COPY %2
 # AVX512VL-NEXT:     RET 0, implicit $ymm0
 body:             |
@@ -129,7 +129,7 @@ registers:
 #
 # AVX512VL:          %0:vr256x = IMPLICIT_DEF
 # AVX512VL-NEXT:     %1:vr128x = COPY $xmm1
-# AVX512VL-NEXT:     %2:vr256x = VINSERTF32x4Z256rri %0, %1, 1
+# AVX512VL-NEXT:     %2:vr256x = VINSERTF32X4Z256rri %0, %1, 1
 # AVX512VL-NEXT:     $ymm0 = COPY %2
 # AVX512VL-NEXT:     RET 0, implicit $ymm0
 body:             |
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir b/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
index 10d98d7a3111b..6fb59df0736da 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
@@ -51,8 +51,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_128_idx0
     ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY $zmm0
     ; ALL: [[COPY1:%[0-9]+]]:vr128x = COPY $xmm1
-    ; ALL: [[VINSERTF32x4Zrri:%[0-9]+]]:vr512 = VINSERTF32x4Zrri [[COPY]], [[COPY1]], 0
-    ; ALL: $zmm0 = COPY [[VINSERTF32x4Zrri]]
+    ; ALL: [[VINSERTF32X4Zrri:%[0-9]+]]:vr512 = VINSERTF32X4Zrri [[COPY]], [[COPY1]], 0
+    ; ALL: $zmm0 = COPY [[VINSERTF32X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = COPY $zmm0
     %1(<4 x s32>) = COPY $xmm1
@@ -102,8 +102,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_128_idx1
     ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY $zmm0
     ; ALL: [[COPY1:%[0-9]+]]:vr128x = COPY $xmm1
-    ; ALL: [[VINSERTF32x4Zrri:%[0-9]+]]:vr512 = VINSERTF32x4Zrri [[COPY]], [[COPY1]], 1
-    ; ALL: $zmm0 = COPY [[VINSERTF32x4Zrri]]
+    ; ALL: [[VINSERTF32X4Zrri:%[0-9]+]]:vr512 = VINSERTF32X4Zrri [[COPY]], [[COPY1]], 1
+    ; ALL: $zmm0 = COPY [[VINSERTF32X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = COPY $zmm0
     %1(<4 x s32>) = COPY $xmm1
@@ -127,8 +127,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_128_idx1_undef
     ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
     ; ALL: [[COPY:%[0-9]+]]:vr128x = COPY $xmm1
-    ; ALL: [[VINSERTF32x4Zrri:%[0-9]+]]:vr512 = VINSERTF32x4Zrri [[DEF]], [[COPY]], 1
-    ; ALL: $zmm0 = COPY [[VINSERTF32x4Zrri]]
+    ; ALL: [[VINSERTF32X4Zrri:%[0-9]+]]:vr512 = VINSERTF32X4Zrri [[DEF]], [[COPY]], 1
+    ; ALL: $zmm0 = COPY [[VINSERTF32X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = IMPLICIT_DEF
     %1(<4 x s32>) = COPY $xmm1
@@ -152,8 +152,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_256_idx0
     ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY $zmm0
     ; ALL: [[COPY1:%[0-9]+]]:vr256x = COPY $ymm1
-    ; ALL: [[VINSERTF64x4Zrri:%[0-9]+]]:vr512 = VINSERTF64x4Zrri [[COPY]], [[COPY1]], 0
-    ; ALL: $zmm0 = COPY [[VINSERTF64x4Zrri]]
+    ; ALL: [[VINSERTF64X4Zrri:%[0-9]+]]:vr512 = VINSERTF64X4Zrri [[COPY]], [[COPY1]], 0
+    ; ALL: $zmm0 = COPY [[VINSERTF64X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = COPY $zmm0
     %1(<8 x s32>) = COPY $ymm1
@@ -203,8 +203,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_256_idx1
     ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY $zmm0
     ; ALL: [[COPY1:%[0-9]+]]:vr256x = COPY $ymm1
-    ; ALL: [[VINSERTF64x4Zrri:%[0-9]+]]:vr512 = VINSERTF64x4Zrri [[COPY]], [[COPY1]], 1
-    ; ALL: $zmm0 = COPY [[VINSERTF64x4Zrri]]
+    ; ALL: [[VINSERTF64X4Zrri:%[0-9]+]]:vr512 = VINSERTF64X4Zrri [[COPY]], [[COPY1]], 1
+    ; ALL: $zmm0 = COPY [[VINSERTF64X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = COPY $zmm0
     %1(<8 x s32>) = COPY $ymm1
@@ -228,8 +228,8 @@ body:             |
     ; ALL-LABEL: name: test_insert_256_idx1_undef
     ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
     ; ALL: [[COPY:%[0-9]+]]:vr256x = COPY $ymm1
-    ; ALL: [[VINSERTF64x4Zrri:%[0-9]+]]:vr512 = VINSERTF64x4Zrri [[DEF]], [[COPY]], 1
-    ; ALL: $zmm0 = COPY [[VINSERTF64x4Zrri]]
+    ; ALL: [[VINSERTF64X4Zrri:%[0-9]+]]:vr512 = VINSERTF64X4Zrri [[DEF]], [[COPY]], 1
+    ; ALL: $zmm0 = COPY [[VINSERTF64X4Zrri]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = IMPLICIT_DEF
     %1(<8 x s32>) = COPY $ymm1
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir b/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
index 9d6494d628bf0..83ce6eb0b17be 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
@@ -29,8 +29,8 @@ body:             |
     ; AVX512VL-LABEL: name: test_merge
     ; AVX512VL: [[DEF:%[0-9]+]]:vr128x = IMPLICIT_DEF
     ; AVX512VL: undef %2.sub_xmm:vr256x = COPY [[DEF]]
-    ; AVX512VL: [[VINSERTF32x4Z256rri:%[0-9]+]]:vr256x = VINSERTF32x4Z256rri %2, [[DEF]], 1
-    ; AVX512VL: $ymm0 = COPY [[VINSERTF32x4Z256rri]]
+    ; AVX512VL: [[VINSERTF32X4Z256rri:%[0-9]+]]:vr256x = VINSERTF32X4Z256rri %2, [[DEF]], 1
+    ; AVX512VL: $ymm0 = COPY [[VINSERTF32X4Z256rri]]
     ; AVX512VL: RET 0, implicit $ymm0
     %0(<4 x s32>) = IMPLICIT_DEF
     %1(<8 x s32>) = G_CONCAT_VECTORS %0(<4 x s32>), %0(<4 x s32>)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir b/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
index 22045d3bb8cbb..d8e3c3aea262b 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
@@ -24,10 +24,10 @@ body:             |
     ; ALL-LABEL: name: test_merge_v128
     ; ALL: [[DEF:%[0-9]+]]:vr128x = IMPLICIT_DEF
     ; ALL: undef %2.sub_xmm:vr512 = COPY [[DEF]]
-    ; ALL: [[VINSERTF32x4Zrri:%[0-9]+]]:vr512 = VINSERTF32x4Zrri %2, [[DEF]], 1
-    ; ALL: [[VINSERTF32x4Zrri1:%[0-9]+]]:vr512 = VINSERTF32x4Zrri [[VINSERTF32x4Zrri]], [[DEF]], 2
-    ; ALL: [[VINSERTF32x4Zrri2:%[0-9]+]]:vr512 = VINSERTF32x4Zrri [[VINSERTF32x4Zrri1]], [[DEF]], 3
-    ; ALL: $zmm0 = COPY [[VINSERTF32x4Zrri2]]
+    ; ALL: [[VINSERTF32X4Zrri:%[0-9]+]]:vr512 = VINSERTF32X4Zrri %2, [[DEF]], 1
+    ; ALL: [[VINSERTF32X4Zrri1:%[0-9]+]]:vr512 = VINSERTF32X4Zrri [[VINSERTF32X4Zrri]], [[DEF]], 2
+    ; ALL: [[VINSERTF32X4Zrri2:%[0-9]+]]:vr512 = VINSERTF32X4Zrri [[VINSERTF32X4Zrri1]], [[DEF]], 3
+    ; ALL: $zmm0 = COPY [[VINSERTF32X4Zrri2]]
     ; ALL: RET 0, implicit $zmm0
     %0(<4 x s32>) = IMPLICIT_DEF
     %1(<16 x s32>) = G_CONCAT_VECTORS %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>)
@@ -49,8 +49,8 @@ body:             |
     ; ALL-LABEL: name: test_merge_v256
     ; ALL: [[DEF:%[0-9]+]]:vr256x = IMPLICIT_DEF
     ; ALL: undef %2.sub_ymm:vr512 = COPY [[DEF]]
-    ; ALL: [[VINSERTF64x4Zrri:%[0-9]+]]:vr512 = VINSERTF64x4Zrri %2, [[DEF]], 1
-    ; ALL: $zmm0 = COPY [[VINSERTF64x4Zrri]]
+    ; ALL: [[VINSERTF64X4Zrri:%[0-9]+]]:vr512 = VINSERTF64X4Zrri %2, [[DEF]], 1
+    ; ALL: $zmm0 = COPY [[VINSERTF64X4Zrri]]
     ; ALL: RET 0, implicit $zmm0
     %0(<8 x s32>) = IMPLICIT_DEF
     %1(<16 x s32>) = G_CONCAT_VECTORS %0(<8 x s32>), %0(<8 x s32>)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir b/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
index 5ed1463f873a9..920d66c3fc0b3 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
@@ -33,9 +33,9 @@ body:             |
     ; AVX512VL-LABEL: name: test_unmerge
     ; AVX512VL: [[DEF:%[0-9]+]]:vr256x = IMPLICIT_DEF
     ; AVX512VL-NEXT: [[COPY:%[0-9]+]]:vr128x = COPY [[DEF]].sub_xmm
-    ; AVX512VL-NEXT: [[VEXTRACTF32x4Z256rri:%[0-9]+]]:vr128x = VEXTRACTF32x4Z256rri [[DEF]], 1
+    ; AVX512VL-NEXT: [[VEXTRACTF32X4Z256rri:%[0-9]+]]:vr128x = VEXTRACTF32X4Z256rri [[DEF]], 1
     ; AVX512VL-NEXT: $xmm0 = COPY [[COPY]]
-    ; AVX512VL-NEXT: $xmm1 = COPY [[VEXTRACTF32x4Z256rri]]
+    ; AVX512VL-NEXT: $xmm1 = COPY [[VEXTRACTF32X4Z256rri]]
     ; AVX512VL-NEXT: RET 0, implicit $xmm0, implicit $xmm1
     %0(<8 x s32>) = IMPLICIT_DEF
     %1(<4 x s32>), %2(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir b/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
index 8864d5bb47488..785cf79ca1db9 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
@@ -27,9 +27,9 @@ body:             |
     ; ALL-LABEL: name: test_unmerge_v128
     ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
     ; ALL: [[COPY:%[0-9]+]]:vr128x = COPY [[DEF]].sub_xmm
-    ; ALL: [[VEXTRACTF32x4Zrri:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrri [[DEF]], 1
-    ; ALL: [[VEXTRACTF32x4Zrri1:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrri [[DEF]], 2
-    ; ALL: [[VEXTRACTF32x4Zrri2:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrri [[DEF]], 3
+    ; ALL: [[VEXTRACTF32X4Zrri:%[0-9]+]]:vr128x = VEXTRACTF32X4Zrri [[DEF]], 1
+    ; ALL: [[VEXTRACTF32X4Zrri1:%[0-9]+]]:vr128x = VEXTRACTF32X4Zrri [[DEF]], 2
+    ; ALL: [[VEXTRACTF32X4Zrri2:%[0-9]+]]:vr128x = VEXTRACTF32X4Zrri [[DEF]], 3
     ; ALL: $xmm0 = COPY [[COPY]]
     ; ALL: RET 0, implicit $xmm0
     %0(<16 x s32>) = IMPLICIT_DEF
@@ -53,7 +53,7 @@ body:             |
     ; ALL-LABEL: name: test_unmerge_v256
     ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
     ; ALL: [[COPY:%[0-9]+]]:vr256x = COPY [[DEF]].sub_ymm
-    ; ALL: [[VEXTRACTF64x4Zrri:%[0-9]+]]:vr256x = VEXTRACTF64x4Zrri [[DEF]], 1
+    ; ALL: [[VEXTRACTF64X4Zrri:%[0-9]+]]:vr256x = VEXTRACTF64X4Zrri [[DEF]], 1
     ; ALL: $ymm0 = COPY [[COPY]]
     ; ALL: RET 0, implicit $ymm0
     %0(<16 x s32>) = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 4972d3e4ec72b..cad1d09f11d9c 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -4756,7 +4756,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/pr114265.mir b/llvm/test/CodeGen/X86/pr114265.mir
deleted file mode 100644
index b6e724b4bd128..0000000000000
--- a/llvm/test/CodeGen/X86/pr114265.mir
+++ /dev/null
@@ -1,94 +0,0 @@
-# The change being tested here is that X86InstrInfo's getSPAdjust correctly handles POP/ADD instructions within
-# call sequences, as previously it assumed only PUSHes would be present for parameter passing.
-# What this test actually does is recreate a situation where:
-#  - something other than a PUSH appears in a call sequence, and
-#  - failing to recognize the SP adjustment by such an instruction actually changes something
-#    observable.
-#
-# To this end, we create a situation where:
-#  - the FP must be spilled around calls
-#  - a frame object is stored before a call frame and loaded in the call frame 
-#    (emulating an argument restored from spill), following a call which POPs something
-#  - call-frame pseudos can *not* be simplified early in prologepilog
-#
-# The issue being corrected is the case where prologepilog sees the SP adjustment of PUSHes only, and not
-# POP/ADD. This adjustment value can be carried over and incorrectly applied to frame offsets. So,
-# in the following we ensure that references to a frame object carry the same offset.
-#
-# NB:
-#  FPClobberedByCall and hasPushSequence have to be supplied in the MFI section. The former
-#  is required to force spill of the FP, and the latter ensures call-frame pseudos are not simplified.
-#
-#  The csr_64_intel_ocl_bi_avx512 regmask is used to ensure that the FP is spilled. Other csr's may
-#  acheive the same.
-#
-# RUN: llc -mtriple x86_64-unknown-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s 
----
-name:            f
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-callsEHReturn:   false
-callsUnwindInit: false
-hasEHCatchret:   false
-hasEHScopes:     false
-hasEHFunclets:   false
-isOutlined:      false
-debugInstrRef:   true
-failsVerification: false
-tracksDebugUserValues: true
-registers:       []
-liveins:
-  - { reg: '$rdi', virtual-reg: '' }
-  - { reg: '$rsi', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    64
-  adjustsStack:    true
-  hasCalls:        true
-  stackProtector:  ''
-  functionContext: ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  hasTailCall:     false
-  isCalleeSavedInfoValid: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:
-  - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, 
-      alignment: 32, stack-id: default, callee-saved-register: '', callee-saved-restored: true, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-machineFunctionInfo:
-  FPClobberedByCall:  true
-  hasPushSequences: true
-body:             |
-  bb.0:
-    liveins: $rdi, $rsi
-    MOV64mr %stack.0, 1, $noreg, 0, $noreg, renamable $rdi :: (store (s64) into %stack.0)
-    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-    CALL64r renamable undef $rsi, csr_64_intel_ocl_bi_avx512, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp
-    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-    $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
-    $rdi = COPY renamable $rax
-    CALL64r renamable undef $rsi, csr_64_intel_ocl_bi_avx512, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp
-    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-...
-# ensure the store and load to the frame object have matching offsets after resolution.
-# CHECK: MOV64mr $rsp, 1, $noreg, [[DISP:[1-9][0-9]+]]
-# CHECK: MOV64rm $rsp, 1, $noreg, [[DISP]]
diff --git a/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll b/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
index 4fb061e1bb764..e8359cb088dc3 100644
--- a/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -24,9 +24,9 @@ define i32 @test_basic(i32 %l) #0 {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl %esp, %eax
 ; X86-NEXT:    leal 15(,%esi,4), %ecx
 ; X86-NEXT:    andl $-16, %ecx
-; X86-NEXT:    movl %esp, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    cmpl %eax, %gs:48
 ; X86-NEXT:    jg .LBB0_4
@@ -39,17 +39,17 @@ define i32 @test_basic(i32 %l) #0 {
 ; X86-NEXT:    calll __morestack_allocate_stack_space
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:  .LBB0_5:
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll dummy_use@PLT
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    je .LBB0_6
 ; X86-NEXT:  # %bb.8: # %false
 ; X86-NEXT:    decl %esi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl %esi, (%esp)
 ; X86-NEXT:    calll test_basic@PLT
 ; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_6: # %true
@@ -83,10 +83,10 @@ define i32 @test_basic(i32 %l) #0 {
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    .cfi_offset %rbx, -24
 ; X64-NEXT:    movl %edi, %ebx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movq %rsp, %rdi
+; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    leaq 15(,%rax,4), %rax
 ; X64-NEXT:    andq $-16, %rax
-; X64-NEXT:    movq %rsp, %rdi
 ; X64-NEXT:    subq %rax, %rdi
 ; X64-NEXT:    cmpq %rdi, %fs:112
 ; X64-NEXT:    jg .LBB0_4
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 056c6404f5cfa..76183ac5f8fa3 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1667,13 +1667,13 @@ define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X64-NEXT:    retq
   %vec = load <2 x float>, ptr %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index e14a12d80f28d..b7e46e51064c0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -1002,7 +1002,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[0],ymm10[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
@@ -1038,7 +1038,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
 ; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
@@ -1973,7 +1973,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[0],ymm8[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[1],ymm8[3],ymm13[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1990,7 +1990,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 320(%rdi), %ymm12
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[0],ymm4[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[1],ymm4[3],ymm12[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2066,7 +2066,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4]
 ; AVX-NEXT:    vmovaps %ymm7, %ymm13
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7]
@@ -2080,7 +2080,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
 ; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4]
 ; AVX-NEXT:    vmovaps %ymm4, %ymm10
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7]
@@ -4025,7 +4025,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4045,7 +4045,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4071,7 +4071,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4096,7 +4096,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 704(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[0],ymm10[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4265,7 +4265,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4283,7 +4283,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm15
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1]
@@ -4301,7 +4301,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1]
@@ -4317,7 +4317,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 592(%rdi), %xmm8
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
 ; AVX-NEXT:    vmovapd 656(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
@@ -8052,7 +8052,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8075,7 +8075,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8103,7 +8103,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8130,7 +8130,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8157,7 +8157,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8184,7 +8184,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8211,7 +8211,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8239,7 +8239,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8604,7 +8604,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8625,7 +8625,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8646,7 +8646,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
@@ -8665,8 +8665,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 656(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8686,7 +8686,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 848(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
@@ -8705,8 +8705,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 1040(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8726,8 +8726,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 1232(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
@@ -8746,8 +8746,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3]
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm9[1],mem[0],ymm9[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 94e91f546a9a3..6f534ee9cdf0b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -227,7 +227,7 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2],ymm1[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
@@ -498,10 +498,10 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, 160(%rdi), %ymm5, %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2],ymm5[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3]
@@ -942,16 +942,16 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm6
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm10
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm11
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm12
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3]
 ; AVX-NEXT:    vmovaps 112(%rdi), %xmm12
@@ -1761,41 +1761,41 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[3]
 ; AVX-NEXT:    vbroadcastsd 560(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[3]
 ; AVX-NEXT:    vbroadcastsd 752(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 464(%rdi), %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 656(%rdi), %ymm14
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3418,44 +3418,44 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd %ymm0, %ymm2
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[3]
 ; AVX-NEXT:    vbroadcastsd 560(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[3]
 ; AVX-NEXT:    vbroadcastsd 752(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 944(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm9
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[3]
 ; AVX-NEXT:    vbroadcastsd 1136(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm8[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[3]
 ; AVX-NEXT:    vbroadcastsd 1328(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1472(%rdi), %ymm7
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[3]
 ; AVX-NEXT:    vbroadcastsd 1520(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3463,52 +3463,52 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm13
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm12
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[3]
 ; AVX-NEXT:    vbroadcastsd 464(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm10
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[3]
 ; AVX-NEXT:    vbroadcastsd 656(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 800(%rdi), %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 848(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 992(%rdi), %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[3]
 ; AVX-NEXT:    vbroadcastsd 1040(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1184(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 1232(%rdi), %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1376(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 1424(%rdi), %ymm15
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 0648d1b4abdf6..07988a416bac4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -328,7 +328,7 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[3],ymm9[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovaps 96(%rdi), %xmm4
@@ -337,7 +337,7 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
@@ -818,12 +818,12 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[3],ymm14[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm14
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3]
@@ -839,13 +839,13 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3]
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm12
 ; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3]
@@ -1789,28 +1789,28 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[3],ymm15[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm6
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[3],ymm11[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 448(%rdi), %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3]
@@ -1842,25 +1842,25 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm9[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
@@ -3694,14 +3694,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm9
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
@@ -3709,7 +3709,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 848(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
@@ -3717,14 +3717,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1248(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 1168(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -3733,7 +3733,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 448(%rdi), %xmm10
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
@@ -3741,7 +3741,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 768(%rdi), %xmm12
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 688(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
@@ -3749,7 +3749,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1088(%rdi), %xmm14
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1008(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -3823,14 +3823,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm4
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
@@ -3839,7 +3839,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm5
 ; AVX-NEXT:    vmovapd 928(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -3848,28 +3848,28 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 1248(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovapd 768(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 704(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
@@ -3877,7 +3877,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -7764,7 +7764,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7773,7 +7773,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7782,7 +7782,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 848(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7792,7 +7792,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1168(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7801,7 +7801,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1568(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1488(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7809,14 +7809,14 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1888(%rdi), %xmm11
 ; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1808(%rdi), %xmm12
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 2208(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 2128(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -7824,7 +7824,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2528(%rdi), %xmm15
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 2448(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7835,7 +7835,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7846,7 +7846,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7857,7 +7857,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 688(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7868,7 +7868,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1008(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7878,7 +7878,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1408(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1328(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -7887,7 +7887,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1728(%rdi), %xmm6
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1648(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -7896,7 +7896,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2048(%rdi), %xmm4
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1968(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -7905,7 +7905,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2368(%rdi), %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 2288(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm9 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -8065,7 +8065,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm9
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
@@ -8074,7 +8074,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm9
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
@@ -8084,7 +8084,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -8093,7 +8093,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
@@ -8103,7 +8103,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX-NEXT:    vmovapd 768(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 704(%rdi), %xmm6
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -8112,7 +8112,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 928(%rdi), %ymm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
@@ -8122,7 +8122,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -8131,7 +8131,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1248(%rdi), %ymm3
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
@@ -8141,7 +8141,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovapd 1408(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 1344(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
@@ -8149,7 +8149,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1568(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 1504(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
@@ -8157,7 +8157,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
@@ -8165,7 +8165,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1888(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 1824(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
@@ -8173,7 +8173,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2048(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -8181,7 +8181,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2208(%rdi), %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 2144(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -8189,7 +8189,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2368(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 2304(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -8197,7 +8197,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2528(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 2464(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index c5be77db9ecf5..4e5501b1041d3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -55,7 +55,7 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0,1],xmm1[2,3]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm0[0],mem[1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -451,7 +451,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm9
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovdqa 128(%rdi), %xmm8
@@ -459,7 +459,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
@@ -468,7 +468,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3]
@@ -1225,10 +1225,10 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm14 = xmm13[0],xmm4[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[3],ymm8[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3]
@@ -1243,13 +1243,13 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 240(%rdi), %xmm14
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
@@ -1266,12 +1266,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3]
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[3],ymm15[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3]
@@ -2817,23 +2817,23 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[3],ymm4[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -2867,28 +2867,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
@@ -2917,22 +2917,22 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3]
@@ -5858,49 +5858,49 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1056(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 960(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1504(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
@@ -5966,14 +5966,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -5981,7 +5981,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1200(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -5990,7 +5990,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1648(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -6000,21 +6000,21 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 976(%rdi), %xmm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
 ; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -6023,7 +6023,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm3
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
@@ -6092,14 +6092,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm10
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
@@ -6108,11 +6108,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm5
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
@@ -6121,11 +6121,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm3
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1312(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3]
@@ -6133,11 +6133,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1536(%rdi), %ymm8
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3]
@@ -12923,40 +12923,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[0],ymm6[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2080(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2176(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2528(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2624(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2976(%rdi), %xmm1
@@ -12964,20 +12964,20 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3072(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 3424(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 3520(%rdi), %ymm15
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -12986,7 +12986,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -12996,7 +12996,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1056(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 960(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13006,7 +13006,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1504(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -13014,7 +13014,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1952(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 1856(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13023,7 +13023,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2400(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 2304(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -13031,7 +13031,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2848(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 2752(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -13039,7 +13039,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3296(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 3200(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -13189,7 +13189,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3]
 ; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm12
 ; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
@@ -13199,7 +13199,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
@@ -13208,7 +13208,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
@@ -13217,7 +13217,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm9
 ; AVX-NEXT:    vmovupd %ymm9, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
@@ -13226,7 +13226,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 976(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
@@ -13235,7 +13235,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1200(%rdi), %xmm6
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3]
 ; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
@@ -13245,7 +13245,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -13254,7 +13254,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1648(%rdi), %xmm7
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
 ; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
@@ -13264,7 +13264,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1872(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -13274,7 +13274,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2096(%rdi), %xmm3
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
 ; AVX-NEXT:    vmovdqa 2208(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
@@ -13284,7 +13284,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2320(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 2432(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
@@ -13292,7 +13292,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2544(%rdi), %xmm14
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3]
 ; AVX-NEXT:    vmovdqa 2656(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
@@ -13303,7 +13303,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2768(%rdi), %xmm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3]
 ; AVX-NEXT:    vmovdqa 2880(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -13313,7 +13313,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2992(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3104(%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
@@ -13324,7 +13324,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3216(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3328(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
@@ -13335,7 +13335,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3440(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3552(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -13486,14 +13486,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13505,12 +13505,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13522,13 +13522,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1312(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13538,13 +13538,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 1440(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1536(%rdi), %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
@@ -13553,11 +13553,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 1888(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1984(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2208(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 2112(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -13567,12 +13567,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2432(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2656(%rdi), %ymm8
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 2560(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -13582,12 +13582,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2880(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3104(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 3008(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -13597,12 +13597,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 3328(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3552(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 3456(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index b93fd4b8f62fb..779ae33a27715 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -725,7 +725,7 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0z3z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
   ret <4 x double> %shuffle
@@ -735,7 +735,7 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_1z2z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
 ; ALL-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
   ret <4 x double> %1
@@ -1202,7 +1202,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2u35:
@@ -1952,7 +1952,7 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1z2z:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
@@ -2038,9 +2038,7 @@ define <4 x double> @add_v4f64_024u_135u_reverse(<4 x double> %a, <4 x double> %
 ; AVX1-LABEL: add_v4f64_024u_135u_reverse:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_024u_135u_reverse:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 97b262cc7ac5c..a706e4ce00ed3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -790,7 +790,7 @@ define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_1z2z5z6z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> <double 0.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 5, i32 8, i32 6, i32 8>
   ret <8 x double> %shuffle
@@ -1663,7 +1663,7 @@ define <8 x double> @shuffle_v8f64_z9zbzdzf(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_z9zbzdzf:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> zeroinitializer, <8 x double> %b, <8 x i32><i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
   ret <8 x double> %shuffle
diff --git a/llvm/test/DebugInfo/Generic/artificial-static-member.ll b/llvm/test/DebugInfo/Generic/artificial-static-member.ll
index 5c247d6959bf7..08f15a226a87c 100644
--- a/llvm/test/DebugInfo/Generic/artificial-static-member.ll
+++ b/llvm/test/DebugInfo/Generic/artificial-static-member.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86_64-linux
+; REQUIRES: target={{x86_64-.*-linux.*}}
 ; RUN: llc -O0 -filetype=obj < %s |   \
 ; RUN: llvm-dwarfdump --debug-info - | FileCheck %s
 
diff --git a/llvm/test/MC/AArch64/SVE/fexpa.s b/llvm/test/MC/AArch64/SVE/fexpa.s
index 1171efc2d6466..c51b1e2b1d3e5 100644
--- a/llvm/test/MC/AArch64/SVE/fexpa.s
+++ b/llvm/test/MC/AArch64/SVE/fexpa.s
@@ -1,5 +1,7 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \
@@ -12,17 +14,17 @@
 fexpa z0.h, z31.h
 // CHECK-INST: fexpa	z0.h, z31.h
 // CHECK-ENCODING: [0xe0,0xbb,0x60,0x04]
-// CHECK-ERROR: instruction requires: sve
+// CHECK-ERROR: instruction requires: sve or sme2p2
 // CHECK-UNKNOWN: 0460bbe0 <unknown>
 
 fexpa z0.s, z31.s
 // CHECK-INST: fexpa	z0.s, z31.s
 // CHECK-ENCODING: [0xe0,0xbb,0xa0,0x04]
-// CHECK-ERROR: instruction requires: sve
+// CHECK-ERROR: instruction requires: sve or sme2p2
 // CHECK-UNKNOWN: 04a0bbe0 <unknown>
 
 fexpa z0.d, z31.d
 // CHECK-INST: fexpa	z0.d, z31.d
 // CHECK-ENCODING: [0xe0,0xbb,0xe0,0x04]
-// CHECK-ERROR: instruction requires: sve
+// CHECK-ERROR: instruction requires: sve or sme2p2
 // CHECK-UNKNOWN: 04e0bbe0 <unknown>
diff --git a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
index fde3d2057b2ad..d3ca4281dca41 100644
--- a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
+++ b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck -check-prefix=GFX940 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX940 %s
 
 scratch_load_dword a2, v4, s6
 // GFX940: scratch_load_dword a2, v4, s6           ; encoding: [0x00,0x60,0x50,0xdc,0x04,0x00,0x86,0x02]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s
index 2a4b3ea201701..0f2852fc531ed 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s
@@ -648,50 +648,56 @@ v_dot2c_f32_f16 v5, src_scc, v2
 v_dot2c_f32_f16 v255, 0xfe0b, v255
 // GFX11: encoding: [0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, v1, v2, 0xfe0b
+v_fmaak_f16 v5.l, v1.l, v2.l, 0xfe0b
 // GFX11: encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, v127, v2, 0xfe0b
+v_fmaak_f16 v5.l, v127.l, v2.l, 0xfe0b
 // GFX11: encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, s1, v2, 0xfe0b
+v_fmaak_f16 v5.l, v1.h, v2.l, 0xfe0b
+// GFX11: encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+v_fmaak_f16 v5.l, v127.h, v2.l, 0xfe0b
+// GFX11: encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+v_fmaak_f16 v5.l, s1, v2.l, 0xfe0b
 // GFX11: encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, s105, v2, 0xfe0b
+v_fmaak_f16 v5.l, s105, v2.l, 0xfe0b
 // GFX11: encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b
+v_fmaak_f16 v5.l, vcc_lo, v2.l, 0xfe0b
 // GFX11: encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b
+v_fmaak_f16 v5.l, vcc_hi, v2.l, 0xfe0b
 // GFX11: encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, ttmp15, v2, 0xfe0b
+v_fmaak_f16 v5.l, ttmp15, v2.l, 0xfe0b
 // GFX11: encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, m0, v2, 0xfe0b
+v_fmaak_f16 v5.l, m0, v2.l, 0xfe0b
 // GFX11: encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, exec_lo, v2, 0xfe0b
+v_fmaak_f16 v5.l, exec_lo, v2.l, 0xfe0b
 // GFX11: encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, exec_hi, v2, 0xfe0b
+v_fmaak_f16 v5.l, exec_hi, v2.l, 0xfe0b
 // GFX11: encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, null, v2, 0xfe0b
+v_fmaak_f16 v5.l, null, v2.l, 0xfe0b
 // GFX11: encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, -1, v2, 0xfe0b
+v_fmaak_f16 v5.l, -1, v2.l, 0xfe0b
 // GFX11: encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, 0.5, v2, 0xfe0b
-// GFX11: encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+v_fmaak_f16 v127.l, 0.5, v127.l, 0xfe0b
+// GFX11: encoding: [0xf0,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, src_scc, v2, 0xfe0b
-// GFX11: encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+v_fmaak_f16 v5.h, src_scc, v2.h, 0xfe0b
+// GFX11: encoding: [0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b
-// GFX11: encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+v_fmaak_f16 v127.h, 0xfe0b, v127.h, 0xfe0b
+// GFX11: encoding: [0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00]
 
 v_fmaak_f32 v5, v1, v2, 0xaf123456
 // GFX11: encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf]
@@ -783,50 +789,56 @@ v_fmac_dx9_zero_f32 v5, src_scc, v2
 v_fmac_dx9_zero_f32 v255, 0xaf123456, v255
 // GFX11: encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf]
 
-v_fmac_f16 v5, v1, v2
+v_fmac_f16 v5.l, v1.l, v2.l
 // GFX11: encoding: [0x01,0x05,0x0a,0x6c]
 
-v_fmac_f16 v5, v127, v2
+v_fmac_f16 v5.l, v127.l, v2.l
 // GFX11: encoding: [0x7f,0x05,0x0a,0x6c]
 
-v_fmac_f16 v5, s1, v2
+v_fmac_f16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x81,0x05,0x0a,0x6c]
+
+v_fmac_f16 v5.l, v127.h, v2.l
+// GFX11: encoding: [0xff,0x05,0x0a,0x6c]
+
+v_fmac_f16 v5.l, s1, v2.l
 // GFX11: encoding: [0x01,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, s105, v2
+v_fmac_f16 v5.l, s105, v2.l
 // GFX11: encoding: [0x69,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, vcc_lo, v2
+v_fmac_f16 v5.l, vcc_lo, v2.l
 // GFX11: encoding: [0x6a,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, vcc_hi, v2
+v_fmac_f16 v5.l, vcc_hi, v2.l
 // GFX11: encoding: [0x6b,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, ttmp15, v2
+v_fmac_f16 v5.l, ttmp15, v2.l
 // GFX11: encoding: [0x7b,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, m0, v2
+v_fmac_f16 v5.l, m0, v2.l
 // GFX11: encoding: [0x7d,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, exec_lo, v2
+v_fmac_f16 v5.l, exec_lo, v2.l
 // GFX11: encoding: [0x7e,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, exec_hi, v2
+v_fmac_f16 v5.l, exec_hi, v2.l
 // GFX11: encoding: [0x7f,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, null, v2
+v_fmac_f16 v5.l, null, v2.l
 // GFX11: encoding: [0x7c,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, -1, v2
+v_fmac_f16 v5.l, -1, v2.l
 // GFX11: encoding: [0xc1,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, 0.5, v2
-// GFX11: encoding: [0xf0,0x04,0x0a,0x6c]
+v_fmac_f16 v127.l, 0.5, v127.l
+// GFX11: encoding: [0xf0,0xfe,0xfe,0x6c]
 
-v_fmac_f16 v5, src_scc, v2
-// GFX11: encoding: [0xfd,0x04,0x0a,0x6c]
+v_fmac_f16 v5.h, src_scc, v2.h
+// GFX11: encoding: [0xfd,0x04,0x0b,0x6d]
 
-v_fmac_f16 v127, 0xfe0b, v127
-// GFX11: encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
+v_fmac_f16 v127.h, 0xfe0b, v127.h
+// GFX11: encoding: [0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00]
 
 v_fmac_f32 v5, v1, v2
 // GFX11: encoding: [0x01,0x05,0x0a,0x56]
@@ -918,50 +930,56 @@ v_fmac_legacy_f32 v5, src_scc, v2
 v_fmac_legacy_f32 v255, 0xaf123456, v255
 // GFX11: encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf]
 
-v_fmamk_f16 v5, v1, 0xfe0b, v3
+v_fmamk_f16 v5.l, v1.l, 0xfe0b, v3.l
 // GFX11: encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, v127, 0xfe0b, v3
+v_fmamk_f16 v5.l, v127.l, 0xfe0b, v3.l
 // GFX11: encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, s1, 0xfe0b, v3
+v_fmamk_f16 v5.l, v1.h, 0xfe0b, v3.l
+// GFX11: encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+v_fmamk_f16 v5.l, v127.h, 0xfe0b, v3.l
+// GFX11: encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+v_fmamk_f16 v5.l, s1, 0xfe0b, v3.l
 // GFX11: encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, s105, 0xfe0b, v3
+v_fmamk_f16 v5.l, s105, 0xfe0b, v3.l
 // GFX11: encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3
+v_fmamk_f16 v5.l, vcc_lo, 0xfe0b, v3.l
 // GFX11: encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3
+v_fmamk_f16 v5.l, vcc_hi, 0xfe0b, v3.l
 // GFX11: encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, ttmp15, 0xfe0b, v3
+v_fmamk_f16 v5.l, ttmp15, 0xfe0b, v3.l
 // GFX11: encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, m0, 0xfe0b, v3
+v_fmamk_f16 v5.l, m0, 0xfe0b, v3.l
 // GFX11: encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, exec_lo, 0xfe0b, v3
+v_fmamk_f16 v5.l, exec_lo, 0xfe0b, v3.l
 // GFX11: encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, exec_hi, 0xfe0b, v3
+v_fmamk_f16 v5.l, exec_hi, 0xfe0b, v3.l
 // GFX11: encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, null, 0xfe0b, v3
+v_fmamk_f16 v5.l, null, 0xfe0b, v3.l
 // GFX11: encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, -1, 0xfe0b, v3
+v_fmamk_f16 v5.l, -1, 0xfe0b, v3.l
 // GFX11: encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, 0.5, 0xfe0b, v3
-// GFX11: encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+v_fmamk_f16 v127.l, 0.5, 0xfe0b, v127.l
+// GFX11: encoding: [0xf0,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, src_scc, 0xfe0b, v3
-// GFX11: encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+v_fmamk_f16 v5.h, src_scc, 0xfe0b, v3.h
+// GFX11: encoding: [0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127
-// GFX11: encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+v_fmamk_f16 v127.h, 0xfe0b, 0xfe0b, v127.h
+// GFX11: encoding: [0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00]
 
 v_fmamk_f32 v5, v1, 0xaf123456, v3
 // GFX11: encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s
index 3eff00bb96e47..f40278cb9c42e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s
@@ -629,47 +629,47 @@ v_dot2c_f32_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
 v_dot2c_f32_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xf5,0x30]
 
-v_fmac_f16 v5, v1, v2 quad_perm:[3,2,1,0]
+v_fmac_f16 v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
 
-v_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+v_fmac_f16 v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
 
-v_fmac_f16 v5, v1, v2 row_mirror
+v_fmac_f16 v5.l, v1.l, v2.l row_mirror
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_half_mirror
+v_fmac_f16 v5.l, v1.l, v2.l row_half_mirror
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shl:1
+v_fmac_f16 v5.l, v1.l, v2.l row_shl:1
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shl:15
+v_fmac_f16 v5.l, v1.l, v2.l row_shl:15
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shr:1
+v_fmac_f16 v5.l, v1.l, v2.l row_shr:1
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shr:15
+v_fmac_f16 v5.l, v1.l, v2.l row_shr:15
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_ror:1
+v_fmac_f16 v5.l, v1.l, v2.l row_ror:1
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_ror:15
+v_fmac_f16 v5.l, v1.l, v2.l row_ror:15
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_fmac_f16 v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
+v_fmac_f16 v127.l, v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x5f,0x01,0x01]
 
-v_fmac_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x09,0x13]
+v_fmac_f16 v5.h, v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: encoding: [0xfa,0x04,0x0b,0x6d,0x81,0x60,0x09,0x13]
 
-v_fmac_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xf5,0x30]
+v_fmac_f16 v127.h, -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: encoding: [0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xf5,0x30]
 
 v_fmac_f32 v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s
index 0f19cf0028525..ffec9f3a7ec09 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s
@@ -132,14 +132,17 @@ v_dot2c_f32_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_dot2c_f32_f16 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: encoding: [0xe9,0xfe,0xff,0x05,0xff,0x00,0x00,0x00]
 
-v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_fmac_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
 
-v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: encoding: [0xea,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
+v_fmac_f16 v127.l, v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x77,0x39,0x05]
 
-v_fmac_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
+v_fmac_f16 v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: encoding: [0xea,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05]
+
+v_fmac_f16 v127.h, v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: encoding: [0xe9,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00]
 
 v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
index dd619f3077f70..2027a436fa72f 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
@@ -29,50 +29,95 @@ v_add_f16_e32 v5, v1, v255
 v_add_f16_e32 v5, v255, v2
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
-v_fmaak_f16_e32 v255, v1, v2, 0xfe0b
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v255.h, v1.h, v2.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
 
-v_fmaak_f16_e32 v5, v1, v255, 0xfe0b
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v255.l, v1.l, v2.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
 
-v_fmaak_f16_e32 v5, v255, v2, 0xfe0b
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.h, v1.h, v255.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
 
-v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.h, v255.h, v2.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
-v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.l, v1.l, v255.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.l, v255.l, v2.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.h, v1.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_e32 v255, v1, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_e32 v5, v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
 
-v_fmac_f16_e32 v5, v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v255, v1, 0xfe0b, v3
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v5, v1, 0xfe0b, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v5, v255, 0xfe0b, v3
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.l, v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_e32 v255.h, v1.h, v2.h
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
+
+v_fmac_f16_e32 v255.l, v1.l, v2.l
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.h, v1.h, v255.h
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.h, v255.h, v2.h
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.l, v1.l, v255.l
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.l, v255.l, v2.l
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v255.h, v1.h, 0xfe0b, v3.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v255.l, v1.l, 0xfe0b, v3.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.h, v1.h, 0xfe0b, v255.h
+// GFX11: :[[@LINE-1]]:37: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.h, v255.h, 0xfe0b, v3.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.l, v1.l, 0xfe0b, v255.l
+// GFX11: :[[@LINE-1]]:37: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.l, v255.l, 0xfe0b, v3.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_ldexp_f16_dpp v255.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc-fake16.s
index faa2b1f976999..4b98cdaa00d7f 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopc-fake16.s
@@ -1,7 +1,7 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
 
 v_cmp_class_f16_e64 s5, v1, v2
 // W32: encoding: [0x05,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s
index 5593ea77d9424..d777f6d29a613 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s
@@ -594,51 +594,63 @@ v_cvt_pkrtz_f16_f32 v5, src_scc, v2
 v_cvt_pkrtz_f16_f32 v255, 0xaf123456, v255
 // GFX12: encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf]
 
-v_fmaak_f16 v5, v1, v2, 0xfe0b
+v_fmaak_f16 v5.l, v1.l, v2.l, 0xfe0b
 // GFX12: encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, v127, v2, 0xfe0b
+v_fmaak_f16 v5.l, v127.l, v2.l, 0xfe0b
 // GFX12: encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, s1, v2, 0xfe0b
+v_fmaak_f16 v5.l, s1, v2.l, 0xfe0b
 // GFX12: encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, s105, v2, 0xfe0b
+v_fmaak_f16 v5.l, s105, v2.l, 0xfe0b
 // GFX12: encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b
+v_fmaak_f16 v5.l, vcc_lo, v2.l, 0xfe0b
 // GFX12: encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b
+v_fmaak_f16 v5.l, vcc_hi, v2.l, 0xfe0b
 // GFX12: encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, ttmp15, v2, 0xfe0b
+v_fmaak_f16 v5.l, ttmp15, v2.l, 0xfe0b
 // GFX12: encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, m0, v2, 0xfe0b
+v_fmaak_f16 v5.l, m0, v2.l, 0xfe0b
 // GFX12: encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, exec_lo, v2, 0xfe0b
+v_fmaak_f16 v5.l, exec_lo, v2.l, 0xfe0b
 // GFX12: encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, exec_hi, v2, 0xfe0b
+v_fmaak_f16 v5.l, exec_hi, v2.l, 0xfe0b
 // GFX12: encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, null, v2, 0xfe0b
+v_fmaak_f16 v5.l, null, v2.l, 0xfe0b
 // GFX12: encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, -1, v2, 0xfe0b
+v_fmaak_f16 v5.l, -1, v2.l, 0xfe0b
 // GFX12: encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, 0.5, v2, 0xfe0b
+v_fmaak_f16 v5.l, 0.5, v2.l, 0xfe0b
 // GFX12: encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v5, src_scc, v2, 0xfe0b
+v_fmaak_f16 v5.l, src_scc, v2.l, 0xfe0b
 // GFX12: encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b
+v_fmaak_f16 v127.l, 0xfe0b, v127.l, 0xfe0b
 // GFX12: encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
 
+v_fmaak_f16 v5.l, v1.h, v2.l, 0xfe0b
+// GFX12: encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+v_fmaak_f16 v5.l, v127.h, v2.l, 0xfe0b
+// GFX12: encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+v_fmaak_f16 v5.h, src_scc, v2.h, 0xfe0b
+// GFX12: encoding: [0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00]
+
+v_fmaak_f16 v127.h, 0xfe0b, v127.h, 0xfe0b
+// GFX12: encoding: [0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00]
+
 v_fmaak_f32 v5, v1, v2, 0xaf123456
 // GFX12: encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf]
 
@@ -684,51 +696,63 @@ v_fmaak_f32 v5, src_scc, v2, 0xaf123456
 v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456
 // GFX12: encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf]
 
-v_fmac_f16 v5, v1, v2
+v_fmac_f16 v5.l, v1.l, v2.l
 // GFX12: encoding: [0x01,0x05,0x0a,0x6c]
 
-v_fmac_f16 v5, v127, v2
+v_fmac_f16 v5.l, v127.l, v2.l
 // GFX12: encoding: [0x7f,0x05,0x0a,0x6c]
 
-v_fmac_f16 v5, s1, v2
+v_fmac_f16 v5.l, s1, v2.l
 // GFX12: encoding: [0x01,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, s105, v2
+v_fmac_f16 v5.l, s105, v2.l
 // GFX12: encoding: [0x69,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, vcc_lo, v2
+v_fmac_f16 v5.l, vcc_lo, v2.l
 // GFX12: encoding: [0x6a,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, vcc_hi, v2
+v_fmac_f16 v5.l, vcc_hi, v2.l
 // GFX12: encoding: [0x6b,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, ttmp15, v2
+v_fmac_f16 v5.l, ttmp15, v2.l
 // GFX12: encoding: [0x7b,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, m0, v2
+v_fmac_f16 v5.l, m0, v2.l
 // GFX12: encoding: [0x7d,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, exec_lo, v2
+v_fmac_f16 v5.l, exec_lo, v2.l
 // GFX12: encoding: [0x7e,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, exec_hi, v2
+v_fmac_f16 v5.l, exec_hi, v2.l
 // GFX12: encoding: [0x7f,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, null, v2
+v_fmac_f16 v5.l, null, v2.l
 // GFX12: encoding: [0x7c,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, -1, v2
+v_fmac_f16 v5.l, -1, v2.l
 // GFX12: encoding: [0xc1,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, 0.5, v2
+v_fmac_f16 v5.l, 0.5, v2.l
 // GFX12: encoding: [0xf0,0x04,0x0a,0x6c]
 
-v_fmac_f16 v5, src_scc, v2
+v_fmac_f16 v5.l, src_scc, v2.l
 // GFX12: encoding: [0xfd,0x04,0x0a,0x6c]
 
-v_fmac_f16 v127, 0xfe0b, v127
+v_fmac_f16 v127.l, 0xfe0b, v127.l
 // GFX12: encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
 
+v_fmac_f16 v5.l, v1.h, v2.l
+// GFX12: encoding: [0x81,0x05,0x0a,0x6c]
+
+v_fmac_f16 v5.l, v127.h, v2.l
+// GFX12: encoding: [0xff,0x05,0x0a,0x6c]
+
+v_fmac_f16 v5.h, src_scc, v2.h
+// GFX12: encoding: [0xfd,0x04,0x0b,0x6d]
+
+v_fmac_f16 v127.h, 0xfe0b, v127.h
+// GFX12: encoding: [0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00]
+
 v_fmac_f32 v5, v1, v2
 // GFX12: encoding: [0x01,0x05,0x0a,0x56]
 
@@ -774,51 +798,63 @@ v_fmac_f32 v5, src_scc, v2
 v_fmac_f32 v255, 0xaf123456, v255
 // GFX12: encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf]
 
-v_fmamk_f16 v5, v1, 0xfe0b, v3
+v_fmamk_f16 v5.l, v1.l, 0xfe0b, v3.l
 // GFX12: encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, v127, 0xfe0b, v3
+v_fmamk_f16 v5.l, v127.l, 0xfe0b, v3.l
 // GFX12: encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, s1, 0xfe0b, v3
+v_fmamk_f16 v5.l, s1, 0xfe0b, v3.l
 // GFX12: encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, s105, 0xfe0b, v3
+v_fmamk_f16 v5.l, s105, 0xfe0b, v3.l
 // GFX12: encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3
+v_fmamk_f16 v5.l, vcc_lo, 0xfe0b, v3.l
 // GFX12: encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3
+v_fmamk_f16 v5.l, vcc_hi, 0xfe0b, v3.l
 // GFX12: encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, ttmp15, 0xfe0b, v3
+v_fmamk_f16 v5.l, ttmp15, 0xfe0b, v3.l
 // GFX12: encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, m0, 0xfe0b, v3
+v_fmamk_f16 v5.l, m0, 0xfe0b, v3.l
 // GFX12: encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, exec_lo, 0xfe0b, v3
+v_fmamk_f16 v5.l, exec_lo, 0xfe0b, v3.l
 // GFX12: encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, exec_hi, 0xfe0b, v3
+v_fmamk_f16 v5.l, exec_hi, 0xfe0b, v3.l
 // GFX12: encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, null, 0xfe0b, v3
+v_fmamk_f16 v5.l, null, 0xfe0b, v3.l
 // GFX12: encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, -1, 0xfe0b, v3
+v_fmamk_f16 v5.l, -1, 0xfe0b, v3.l
 // GFX12: encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, 0.5, 0xfe0b, v3
+v_fmamk_f16 v5.l, 0.5, 0xfe0b, v3.l
 // GFX12: encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v5, src_scc, 0xfe0b, v3
+v_fmamk_f16 v5.l, src_scc, 0xfe0b, v3.l
 // GFX12: encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127
+v_fmamk_f16 v127.l, 0xfe0b, 0xfe0b, v127.l
 // GFX12: encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
 
+v_fmamk_f16 v5.l, v1.h, 0xfe0b, v3.l
+// GFX12: encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+v_fmamk_f16 v5.l, v127.h, 0xfe0b, v3.l
+// GFX12: encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+v_fmamk_f16 v5.h, src_scc, 0xfe0b, v3.h
+// GFX12: encoding: [0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00]
+
+v_fmamk_f16 v127.h, 0xfe0b, 0xfe0b, v127.h
+// GFX12: encoding: [0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00]
+
 v_fmamk_f32 v5, v1, 0xaf123456, v3
 // GFX12: encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
index a0f93f459f915..4424fced3e3ea 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
@@ -521,48 +521,54 @@ v_cvt_pkrtz_f16_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_cvt_pkrtz_f16_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xf5,0x30]
 
-v_fmac_f16 v5, v1, v2 quad_perm:[3,2,1,0]
+v_fmac_f16 v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
 
-v_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+v_fmac_f16 v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
 
-v_fmac_f16 v5, v1, v2 row_mirror
+v_fmac_f16 v5.l, v1.l, v2.l row_mirror
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_half_mirror
+v_fmac_f16 v5.l, v1.l, v2.l row_half_mirror
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shl:1
+v_fmac_f16 v5.l, v1.l, v2.l row_shl:1
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shl:15
+v_fmac_f16 v5.l, v1.l, v2.l row_shl:15
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shr:1
+v_fmac_f16 v5.l, v1.l, v2.l row_shr:1
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_shr:15
+v_fmac_f16 v5.l, v1.l, v2.l row_shr:15
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_ror:1
+v_fmac_f16 v5.l, v1.l, v2.l row_ror:1
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_ror:15
+v_fmac_f16 v5.l, v1.l, v2.l row_ror:15
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
+v_fmac_f16 v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
 
-v_fmac_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
+v_fmac_f16 v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
 
-v_fmac_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+v_fmac_f16 v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x09,0x13]
 
-v_fmac_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+v_fmac_f16 v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xf5,0x30]
 
+v_fmac_f16 v5.h, v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x04,0x0b,0x6d,0x81,0x60,0x09,0x13]
+
+v_fmac_f16 v127.h, -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xf5,0x30]
+
 v_fmac_f32 v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
index 81fcb323e2711..05971722a7268 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
@@ -114,15 +114,21 @@ v_cvt_pkrtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_pkrtz_f16_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: encoding: [0xe9,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00]
 
-v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_fmac_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
 
-v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+v_fmac_f16 v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX12: encoding: [0xea,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
 
-v_fmac_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+v_fmac_f16 v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
 
+v_fmac_f16 v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05]
+
+v_fmac_f16 v127.h, v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00]
+
 v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
index b339bc1960f3e..92729d4bca3ce 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
@@ -28,50 +28,95 @@ v_add_f16_e32 v5, v1, v255
 v_add_f16_e32 v5, v255, v2
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
-v_fmaak_f16_e32 v255, v1, v2, 0xfe0b
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v255.h, v1.h, v2.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
 
-v_fmaak_f16_e32 v5, v1, v255, 0xfe0b
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v255.l, v1.l, v2.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
 
-v_fmaak_f16_e32 v5, v255, v2, 0xfe0b
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.h, v1.h, v255.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction
 
-v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.h, v255.h, v2.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
-v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.l, v1.l, v255.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmaak_f16_e32 v5.l, v255.l, v2.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.h, v1.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_e32 v255, v1, v2
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v255.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
 
-v_fmac_f16_e32 v5, v1, v255
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
 
-v_fmac_f16_e32 v5, v255, v2
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v255, v1, 0xfe0b, v3
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v5, v1, 0xfe0b, v255
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.h, v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
 
-v_fmamk_f16_e32 v5, v255, 0xfe0b, v3
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_fmac_f16_dpp v5.l, v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_e32 v255.h, v1.h, v2.h
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
+
+v_fmac_f16_e32 v255.l, v1.l, v2.l
+// GFX12: :[[@LINE-1]]:16: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.h, v1.h, v255.h
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.h, v255.h, v2.h
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.l, v1.l, v255.l
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_fmac_f16_e32 v5.l, v255.l, v2.l
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v255.h, v1.h, 0xfe0b, v3.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v255.l, v1.l, 0xfe0b, v3.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.h, v1.h, 0xfe0b, v255.h
+// GFX12: :[[@LINE-1]]:37: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.h, v255.h, 0xfe0b, v3.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.l, v1.l, 0xfe0b, v255.l
+// GFX12: :[[@LINE-1]]:37: error: invalid operand for instruction
+
+v_fmamk_f16_e32 v5.l, v255.l, 0xfe0b, v3.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_ldexp_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c-fake16.s
index 76db94023fc90..9c93c86135250 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c-fake16.s
@@ -1,7 +1,7 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
 
 v_cmp_class_f16_e64 s5, v1, v2
 // W32: encoding: [0x05,0x00,0x7d,0xd4,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index e208b6cf903d3..e2e84f27b828a 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s
 
diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
new file mode 100644
index 0000000000000..f8bbd40b700fd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
@@ -0,0 +1,179 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x4_xf32
+//===----------------------------------------------------------------------===//
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_16x16x8_xf32
+//===----------------------------------------------------------------------===//
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
new file mode 100644
index 0000000000000..405d152c93d86
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s
+// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s
+// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s
+// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: global_load_lds_dwordx3 v[2:3], off     ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
+
+global_load_lds_dwordx3 v[2:3], off
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
+global_load_lds_dwordx3 v[2:3], off sc0 nt sc1
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
+global_load_lds_dwordx3 v[2:3], off offset:4
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
+global_load_lds_dwordx3 v2, s[4:5] offset:4
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: global_load_lds_dwordx4 v[2:3], off     ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
+global_load_lds_dwordx4 v[2:3], off
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
+global_load_lds_dwordx4 v[2:3], off sc0 nt sc1
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
+global_load_lds_dwordx4 v[2:3], off offset:4
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
+global_load_lds_dwordx4 v2, s[4:5] offset:4
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s
new file mode 100644
index 0000000000000..66dae85ee8e3e
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1.s
@@ -0,0 +1,130 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=GFX940-ERR --strict-whitespace  %s
+
+v_prng_b32 v5, v1
+// GFX950: v_prng_b32_e32 v5, v1                   ; encoding: [0x01,0xb1,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v255
+// GFX950: v_prng_b32_e32 v5, v255                 ; encoding: [0xff,0xb1,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, s1
+// GFX950: v_prng_b32_e32 v5, s1                   ; encoding: [0x01,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, s101
+// GFX950: v_prng_b32_e32 v5, s101                 ; encoding: [0x65,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, vcc_lo
+// GFX950: v_prng_b32_e32 v5, vcc_lo               ; encoding: [0x6a,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, vcc_hi
+// GFX950: v_prng_b32_e32 v5, vcc_hi               ; encoding: [0x6b,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, ttmp15
+// GFX950: v_prng_b32_e32 v5, ttmp15               ; encoding: [0x7b,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, m0
+// GFX950: v_prng_b32_e32 v5, m0                   ; encoding: [0x7c,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, exec_lo
+// GFX950: v_prng_b32_e32 v5, exec_lo              ; encoding: [0x7e,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, exec_hi
+// GFX950: v_prng_b32_e32 v5, exec_hi              ; encoding: [0x7f,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, -1
+// GFX950: v_prng_b32_e32 v5, -1                   ; encoding: [0xc1,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, 0.5
+// GFX950: v_prng_b32_e32 v5, 0.5                  ; encoding: [0xf0,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v5, src_scc
+// GFX950: v_prng_b32_e32 v5, src_scc              ; encoding: [0xfd,0xb0,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_prng_b32 v255, 0xaf123456
+// GFX950: v_prng_b32_e32 v255, 0xaf123456         ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, v1
+// GFX950: v_cvt_f32_bf16_e32 v5, v1               ; encoding: [0x01,0xb7,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, v127
+// GFX950: v_cvt_f32_bf16_e32 v5, v127             ; encoding: [0x7f,0xb7,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, s1
+// GFX950: v_cvt_f32_bf16_e32 v5, s1               ; encoding: [0x01,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, vcc_lo
+// GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo           ; encoding: [0x6a,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, vcc_hi
+// GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi           ; encoding: [0x6b,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, ttmp15
+// GFX950: v_cvt_f32_bf16_e32 v5, ttmp15           ; encoding: [0x7b,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, m0
+// GFX950: v_cvt_f32_bf16_e32 v5, m0               ; encoding: [0x7c,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, exec_lo
+// GFX950: v_cvt_f32_bf16_e32 v5, exec_lo          ; encoding: [0x7e,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, exec_hi
+// GFX950: v_cvt_f32_bf16_e32 v5, exec_hi          ; encoding: [0x7f,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, -1
+// GFX950: v_cvt_f32_bf16_e32 v5, -1               ; encoding: [0xc1,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, 0.5
+// GFX950: v_cvt_f32_bf16_e32 v5, 0.5              ; encoding: [0xf0,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, src_scc
+// GFX950: v_cvt_f32_bf16_e32 v5, src_scc          ; encoding: [0xfd,0xb6,0x0a,0x7e]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v127, 0x8000
+// GFX950: v_cvt_f32_bf16_e32 v127, 0x8000         ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, -v1
+// GFX950: v_cvt_f32_bf16_e64 v5, -v1              ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, |v1|
+// GFX950: v_cvt_f32_bf16_e64 v5, |v1|             ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, -|v1|
+// GFX950: v_cvt_f32_bf16_e64 v5, -|v1|            ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16 v5, v1 clamp mul:2
+// GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp mul:2   ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf16_e64 v5, v1 clamp div:2
+// GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2   ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s
new file mode 100644
index 0000000000000..301750689bc78
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s
@@ -0,0 +1,31 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefixes=GFX950 %s
+
+v_prng_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX950: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ;   encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_prng_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX950: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ;   encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_prng_b32 v5, v1 row_mirror
+// GFX950: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ;            encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_prng_b32 v5, v1 row_half_mirror
+// GFX950: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ;       encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_prng_b32 v5, v1 row_shl:1
+// GFX950: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ;             encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_prng_b32 v5, v1 row_shl:15
+// GFX950: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ;            encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_prng_b32 v5, v1 row_shr:1
+// GFX950: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ;             encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_prng_b32 v5, v1 row_shr:15
+// GFX950: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ;            encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_prng_b32 v5, v1 row_ror:1
+// GFX950: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ;             encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_prng_b32 v5, v1 row_ror:15
+// GFX950: v_prng_b32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ;            encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
new file mode 100644
index 0000000000000..c9980f420b955
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=GFX940-ERR --strict-whitespace  %s
+
+v_cvt_pk_bf16_f32 v5, v1, v2
+// GFX950: v_cvt_pk_bf16_f32 v5, v1, v2            ; encoding: [0x05,0x00,0x68,0xd2,0x01,0x05,0x02,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32 v5, v255, v255
+// GFX950: v_cvt_pk_bf16_f32 v5, v255, v255        ; encoding: [0x05,0x00,0x68,0xd2,0xff,0xff,0x03,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32 v5, v1, s2
+// GFX950: v_cvt_pk_bf16_f32 v5, v1, s2           ; encoding: [0x05,0x00,0x68,0xd2,0x01,0x05,0x00,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32 v5, m0, 0.5
+// GFX950: v_cvt_pk_bf16_f32 v5, m0, 0.5           ; encoding: [0x05,0x00,0x68,0xd2,0x7c,0xe0,0x01,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32 v5, -1, exec_hi
+// GFX950: v_cvt_pk_bf16_f32 v5, -1, exec_hi       ; encoding: [0x05,0x00,0x68,0xd2,0xc1,0xfe,0x00,0x00]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2
+// GFX950: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2     ; encoding: [0x05,0x00,0x68,0xd2,0xf0,0xf8,0x00,0x08]
+// GFX940-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
new file mode 100644
index 0000000000000..0697ee8661e76
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
\ No newline at end of file
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
new file mode 100644
index 0000000000000..a6dff076392c8
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
@@ -0,0 +1,31 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+
+v_mfma_ld_scale_b32 v0, 65
+// CHECK: :[[@LINE-1]]:25: error: literal operands are not supported
+
+v_mfma_ld_scale_b32 65, v0
+// CHECK: :[[@LINE-1]]:21: error: literal operands are not supported
+
+v_mfma_ld_scale_b32 65, 65
+// CHECK: :[[@LINE-1]]:25: error: literal operands are not supported
+
+v_mfma_ld_scale_b32 s0, s1
+// CHECK: :[[@LINE-1]]:25: error: invalid operand (violates constant bus restrictions)
+
+v_mfma_ld_scale_b32 v0, v0 clamp
+// CHECK: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1]
+// CHECK: :[[@LINE-1]]:28: error: not a valid operand
+
+v_mfma_ld_scale_b32 v0, v0 neg_lo:[1,1]
+// CHECK: :[[@LINE-1]]:28: error: not a valid operand
+
+v_mfma_ld_scale_b32 v0, v0 neg_hi:[1,1]
+// CHECK: :[[@LINE-1]]:28: error: not a valid operand
+
+v_mfma_ld_scale_b32 v0, v0 neg_hi:[0,1]
+// CHECK: :[[@LINE-1]]:28: error: not a valid operand
+
+v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1] neg_hi:[0,1]
+// CHECK: :[[@LINE-1]]:28: error: not a valid operand
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s
new file mode 100644
index 0000000000000..a692693638c69
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mai-gfx950.s
@@ -0,0 +1,277 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX950 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+//===----------------------------------------------------------------------===//
+// MFMA opcodes.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_16x16x32_f16
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3]
+
+// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32f16 v[0:3], v[0:3], v[0:3], v[0:3]
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3]
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32f16 a[0:3], a[0:3], a[0:3], a[0:3]
+
+// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0
+
+// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+
+// GFX950:  v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+
+// GFX950:  v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+
+// GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
+
+// GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7]
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x16_f16
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15]
+
+// GFX950:  v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16f16 v[0:15], v[0:3], v[0:3], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16f16 a[0:15], a[0:3], a[0:3], a[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0
+
+// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0
+
+// GFX950:  v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5
+
+// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
+
+// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3
+
+// GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1
+
+// GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x16_bf16
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15]
+
+// GFX950:  v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16bf16 v[0:15], v[0:3], v[0:3], v[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16bf16 a[0:15], a[0:3], a[0:3], a[0:15]
+
+// GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], 1.0
+
+// GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], 1.0
+
+// GFX950:  v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5
+
+// GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
+
+// GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3
+
+// GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1
+
+// GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1
+
+//===----------------------------------------------------------------------===//
+// v_mfma_ld_scale_b32
+//===----------------------------------------------------------------------===//
+
+// GFX950: v_mfma_ld_scale_b32 v0, 64             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v0, 64
+
+// GFX950: v_mfma_ld_scale_b32 64, v0             ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 64, v0
+
+// GFX950: v_mfma_ld_scale_b32 64, 64             ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 64, 64
+
+// GFX950: v_mfma_ld_scale_b32 s0, s0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 s0, s0
+
+// GFX950: v_mfma_ld_scale_b32 s0, v0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 s0, v0
+
+// GFX950: v_mfma_ld_scale_b32 v0, s0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v0, s0
+
+// GFX950: v_mfma_ld_scale_b32 vcc_lo, vcc_lo     ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 vcc_lo, vcc_lo
+
+// GFX950: v_mfma_ld_scale_b32 m0, m0             ; encoding: [0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 m0, m0
+
+// GFX950: v_mfma_ld_scale_b32 src_vccz, src_vccz ; encoding: [0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 vccz, vccz
+
+// GFX950: v_mfma_ld_scale_b32 src_execz, src_execz ; encoding: [0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 execz, execz
+
+// GFX950:  v_mfma_ld_scale_b32 v0, v0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v0, v0
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1
+
+// GFX950: v_mfma_ld_scale_b32 0, 0 ; encoding: [0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 0, 0
+
+// GFX950: v_mfma_ld_scale_b32 1, 0               ; encoding: [0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 1, 0
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[1, 0]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0, 1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[1, 1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1, 0]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0, 1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1             ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1, 1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0,0] op_sel_hi:[0,0]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] op_sel_hi:[1,1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1]
+
+// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08]
+// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0]
+
diff --git a/llvm/test/MC/AMDGPU/mubuf-gfx950.s b/llvm/test/MC/AMDGPU/mubuf-gfx950.s
new file mode 100644
index 0000000000000..0ba6f2ca4f6c4
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mubuf-gfx950.s
@@ -0,0 +1,32 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX950 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx803 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+// FIXME: Bad diagnostics on unsupported subtargets: generic operand errors are reported rather than a clear unsupported-instruction diagnostic.
+
+// GFX950: buffer_load_dwordx3 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03]
+// ERR: :[[@LINE+1]]:21: error: invalid operand for instruction
+buffer_load_dwordx3 off, s[8:11], s3 lds
+
+// GFX950: buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03]
+// ERR: :[[@LINE+1]]:38: error: not a valid operand
+buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds
+
+// GFX950: buffer_load_dwordx3 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65]
+// ERR: :[[@LINE+1]]:39: error: invalid operand for instruction
+buffer_load_dwordx3 v0, s[8:11], s101 offen lds
+
+
+
+// GFX950: buffer_load_dwordx4 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03]
+// ERR: :[[@LINE+1]]:21: error: invalid operand for instruction
+buffer_load_dwordx4 off, s[8:11], s3 lds
+
+// GFX950: buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03]
+// ERR: :[[@LINE+1]]:38: error: not a valid operand
+buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds
+
+// GFX950: buffer_load_dwordx4 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65]
+// ERR: :[[@LINE+1]]:39: error: invalid operand for instruction
+buffer_load_dwordx4 v0, s[8:11], s101 offen lds
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
index 4f638cd8ff54f..372721a17d4d7 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
@@ -469,49 +469,70 @@
 # GFX11: v_dot2acc_f32_f16 v255, 0xfe0b, v255    ; encoding: [0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00]
 
 0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, v1, v2, 0xfe0b          ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, v1.l, v2.l, 0xfe0b    ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, v1, v2, 0xfe0b          ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, v127, v2, 0xfe0b        ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, v127.l, v2.l, 0xfe0b  ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, v127, v2, 0xfe0b        ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmaak_f16 v5.l, v1.h, v2.l, 0xfe0b    ; encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2, 0xfe0b ; encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmaak_f16 v5.l, v127.h, v2.l, 0xfe0b  ; encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2, 0xfe0b ; encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, s1, v2, 0xfe0b          ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, s1, v2.l, 0xfe0b      ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, s1, v2, 0xfe0b          ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, s105, v2, 0xfe0b        ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, s105, v2.l, 0xfe0b    ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, s105, v2, 0xfe0b        ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b      ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, vcc_lo, v2.l, 0xfe0b  ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b      ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b      ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, vcc_hi, v2.l, 0xfe0b  ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b      ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b      ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, ttmp15, v2.l, 0xfe0b  ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b      ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, m0, v2, 0xfe0b          ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, m0, v2.l, 0xfe0b      ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, m0, v2, 0xfe0b          ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b     ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, exec_lo, v2.l, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b     ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b     ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, exec_hi, v2.l, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b     ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, null, v2, 0xfe0b        ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, null, v2.l, 0xfe0b    ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, null, v2, 0xfe0b        ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, -1, v2, 0xfe0b          ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmaak_f16 v5.l, -1, v2.l, 0xfe0b      ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v5, -1, v2, 0xfe0b          ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
-0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, 0.5, v2, 0xfe0b         ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+0xf0,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmaak_f16 v127.l, 0.5, v127.l, 0xfe0b ; encoding: [0xf0,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmaak_f16 v127, 0.5, v127, 0xfe0b     ; encoding: [0xf0,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
 
-0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v5, src_scc, v2, 0xfe0b     ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmaak_f16 v5.h, src_scc, v2.h, 0xfe0b ; encoding: [0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00]
 
-0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b  ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmaak_f16 v127.h, 0xfe0b, v127.h, 0xfe0b ; encoding: [0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00]
 
 0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf
 # GFX11: v_fmaak_f32 v5, v1, v2, 0xaf123456      ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf]
@@ -604,49 +625,70 @@
 # GFX11: v_fmac_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf]
 
 0x01,0x05,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, v1.l, v2.l         ; encoding: [0x01,0x05,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x6c]
 
 0x7f,0x05,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, v127, v2             ; encoding: [0x7f,0x05,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, v127.l, v2.l       ; encoding: [0x7f,0x05,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, v127, v2             ; encoding: [0x7f,0x05,0x0a,0x6c]
+
+0x81,0x05,0x0a,0x6c
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, v1.h, v2.l         ; encoding: [0x81,0x05,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x6c]
+
+0xff,0x05,0x0a,0x6c
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, v127.h, v2.l       ; encoding: [0xff,0x05,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x6c]
 
 0x01,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, s1, v2               ; encoding: [0x01,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, s1, v2.l           ; encoding: [0x01,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, s1, v2               ; encoding: [0x01,0x04,0x0a,0x6c]
 
 0x69,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, s105, v2             ; encoding: [0x69,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, s105, v2.l         ; encoding: [0x69,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, s105, v2             ; encoding: [0x69,0x04,0x0a,0x6c]
 
 0x6a,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, vcc_lo, v2           ; encoding: [0x6a,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, vcc_lo, v2.l       ; encoding: [0x6a,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, vcc_lo, v2           ; encoding: [0x6a,0x04,0x0a,0x6c]
 
 0x6b,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, vcc_hi, v2           ; encoding: [0x6b,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, vcc_hi, v2.l       ; encoding: [0x6b,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, vcc_hi, v2           ; encoding: [0x6b,0x04,0x0a,0x6c]
 
 0x7b,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, ttmp15, v2           ; encoding: [0x7b,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, ttmp15, v2.l       ; encoding: [0x7b,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, ttmp15, v2           ; encoding: [0x7b,0x04,0x0a,0x6c]
 
 0x7d,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, m0, v2               ; encoding: [0x7d,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, m0, v2.l           ; encoding: [0x7d,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, m0, v2               ; encoding: [0x7d,0x04,0x0a,0x6c]
 
 0x7e,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, exec_lo, v2          ; encoding: [0x7e,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, exec_lo, v2.l      ; encoding: [0x7e,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, exec_lo, v2          ; encoding: [0x7e,0x04,0x0a,0x6c]
 
 0x7f,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, exec_hi, v2          ; encoding: [0x7f,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, exec_hi, v2.l      ; encoding: [0x7f,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, exec_hi, v2          ; encoding: [0x7f,0x04,0x0a,0x6c]
 
 0x7c,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, null, v2             ; encoding: [0x7c,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, null, v2.l         ; encoding: [0x7c,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, null, v2             ; encoding: [0x7c,0x04,0x0a,0x6c]
 
 0xc1,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, -1, v2               ; encoding: [0xc1,0x04,0x0a,0x6c]
+# GFX11-REAL16: v_fmac_f16_e32 v5.l, -1, v2.l           ; encoding: [0xc1,0x04,0x0a,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v5, -1, v2               ; encoding: [0xc1,0x04,0x0a,0x6c]
 
-0xf0,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, 0.5, v2              ; encoding: [0xf0,0x04,0x0a,0x6c]
+0xf0,0xfe,0xfe,0x6c
+# GFX11-REAL16: v_fmac_f16_e32 v127.l, 0.5, v127.l      ; encoding: [0xf0,0xfe,0xfe,0x6c]
+# GFX11-FAKE16: v_fmac_f16_e32 v127, 0.5, v127          ; encoding: [0xf0,0xfe,0xfe,0x6c]
 
-0xfd,0x04,0x0a,0x6c
-# GFX11: v_fmac_f16_e32 v5, src_scc, v2          ; encoding: [0xfd,0x04,0x0a,0x6c]
+0xfd,0x04,0x0b,0x6d
+# GFX11-REAL16: v_fmac_f16_e32 v5.h, src_scc, v2.h      ; encoding: [0xfd,0x04,0x0b,0x6d]
 
-0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmac_f16_e32 v127, 0xfe0b, v127       ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
+0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmac_f16_e32 v127.h, 0xfe0b, v127.h   ; encoding: [0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00]
 
 0x01,0x05,0x0a,0x56
 # GFX11: v_fmac_f32_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x56]
@@ -694,49 +736,70 @@
 # GFX11: v_fmac_f32_e32 v255, 0xaf123456, v255   ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf]
 
 0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, v1, 0xfe0b, v3          ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, v1.l, 0xfe0b, v3.l    ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, v1, 0xfe0b, v3          ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, v127, 0xfe0b, v3        ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, v127.l, 0xfe0b, v3.l  ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, v127, 0xfe0b, v3        ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmamk_f16 v5.l, v1.h, 0xfe0b, v3.l    ; encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, 0xfe0b, v3 ; encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmamk_f16 v5.l, v127.h, 0xfe0b, v3.l  ; encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, 0xfe0b, v3 ; encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, s1, 0xfe0b, v3          ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, s1, 0xfe0b, v3.l      ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, s1, 0xfe0b, v3          ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, s105, 0xfe0b, v3        ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, s105, 0xfe0b, v3.l    ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, s105, 0xfe0b, v3        ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3      ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, vcc_lo, 0xfe0b, v3.l  ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3      ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3      ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, vcc_hi, 0xfe0b, v3.l  ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3      ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3      ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, ttmp15, 0xfe0b, v3.l  ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3      ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, m0, 0xfe0b, v3          ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, m0, 0xfe0b, v3.l      ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, m0, 0xfe0b, v3          ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3     ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, exec_lo, 0xfe0b, v3.l ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3     ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3     ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, exec_hi, 0xfe0b, v3.l ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3     ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, null, 0xfe0b, v3        ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, null, 0xfe0b, v3.l    ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, null, 0xfe0b, v3        ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, -1, 0xfe0b, v3          ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fmamk_f16 v5.l, -1, 0xfe0b, v3.l      ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v5, -1, 0xfe0b, v3          ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
-0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, 0.5, 0xfe0b, v3         ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+0xf0,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmamk_f16 v127.l, 0.5, 0xfe0b, v127.l ; encoding: [0xf0,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fmamk_f16 v127, 0.5, 0xfe0b, v127     ; encoding: [0xf0,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
 
-0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v5, src_scc, 0xfe0b, v3     ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmamk_f16 v5.h, src_scc, 0xfe0b, v3.h ; encoding: [0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00]
 
-0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127  ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fmamk_f16 v127.h, 0xfe0b, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf
 # GFX11: v_fmamk_f32 v5, v1, 0xaf123456, v3      ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
index a8a40f883cc48..84465624da6ba 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt
@@ -437,46 +437,60 @@
 # GFX11: v_dot2acc_f32_f16_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xfd,0x30]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
 
-0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
+0xfa,0xfe,0xfe,0x6c,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_fmac_f16_dpp v127.l, v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fmac_f16_dpp v127, v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x5f,0x01,0x01]
 
-0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13
-# GFX11: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13]
+0xfa,0x04,0x0b,0x6d,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_fmac_f16_dpp v5.h, v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x6d,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
 
-0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30
-# GFX11: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30]
+0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xfd,0x30
+# GFX11-REAL16: v_fmac_f16_dpp v127.h, -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xfd,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v126, v255, v183      ; encoding: [0xff,0x6f,0xfd,0x30]
 
 0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff
 # GFX11: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
index a1d2c34f09f2b..3d4a16d41880a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt
@@ -65,10 +65,19 @@
 # GFX11: v_dot2acc_f32_f16_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x05,0xff,0x00,0x00,0x00]
 
 0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05
-# GFX11: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
 
-0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00
-# GFX11: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
+0xe9,0xfe,0xfe,0x6c,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_fmac_f16_dpp v127.l, v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fmac_f16_dpp v127, v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_fmac_f16_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_fmac_f16_dpp v127.h, v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00]
 
 0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05
 # GFX11: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt
index 1276d898160b3..b120d6a030cc9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt
@@ -446,49 +446,78 @@
 # GFX12: v_cvt_pk_rtz_f16_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf]
 
 0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, v1, v2, 0xfe0b          ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, v1.l, v2.l, 0xfe0b    ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, v1, v2, 0xfe0b          ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, v127, v2, 0xfe0b        ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, v127.l, v2.l, 0xfe0b  ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, v127, v2, 0xfe0b        ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, s1, v2, 0xfe0b          ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, s1, v2.l, 0xfe0b      ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, s1, v2, 0xfe0b          ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, s105, v2, 0xfe0b        ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, s105, v2.l, 0xfe0b    ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, s105, v2, 0xfe0b        ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b      ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, vcc_lo, v2.l, 0xfe0b  ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b      ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b      ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, vcc_hi, v2.l, 0xfe0b  ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b      ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b      ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, ttmp15, v2.l, 0xfe0b  ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b      ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, m0, v2, 0xfe0b          ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, m0, v2.l, 0xfe0b      ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, m0, v2, 0xfe0b          ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b     ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, exec_lo, v2.l, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b     ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b     ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, exec_hi, v2.l, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b     ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, null, v2, 0xfe0b        ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, null, v2.l, 0xfe0b    ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, null, v2, 0xfe0b        ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, -1, v2, 0xfe0b          ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, -1, v2.l, 0xfe0b      ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, -1, v2, 0xfe0b          ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, 0.5, v2, 0xfe0b         ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, 0.5, v2.l, 0xfe0b     ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, 0.5, v2, 0xfe0b         ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v5, src_scc, v2, 0xfe0b     ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v5.l, src_scc, v2.l, 0xfe0b ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, src_scc, v2, 0xfe0b     ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00]
 
 0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b  ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmaak_f16 v127.l, 0xfe0b, v127.l, 0xfe0b ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b  ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00]
+
+0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmaak_f16 v5.l, v1.h, v2.l, 0xfe0b    ; encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2, 0xfe0b ; encoding: [0x81,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmaak_f16 v5.l, v127.h, v2.l, 0xfe0b  ; encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmaak_f16 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2, 0xfe0b ; encoding: [0xff,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00]
+
+0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmaak_f16 v5.h, src_scc, v2.h, 0xfe0b ; encoding: [0xfd,0x04,0x0b,0x71,0x0b,0xfe,0x00,0x00]
+
+0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmaak_f16 v127.h, 0xfe0b, v127.h, 0xfe0b ; encoding: [0xff,0xfe,0xff,0x71,0x0b,0xfe,0x00,0x00]
 
 0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf
 # GFX12: v_fmaak_f32 v5, v1, v2, 0xaf123456      ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf]
@@ -536,49 +565,78 @@
 # GFX12: v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 ; encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf]
 
 0x01,0x05,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, v1.l, v2.l         ; encoding: [0x01,0x05,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x6c]
 
 0x7f,0x05,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, v127, v2             ; encoding: [0x7f,0x05,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, v127.l, v2.l       ; encoding: [0x7f,0x05,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, v127, v2             ; encoding: [0x7f,0x05,0x0a,0x6c]
 
 0x01,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, s1, v2               ; encoding: [0x01,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, s1, v2.l           ; encoding: [0x01,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, s1, v2               ; encoding: [0x01,0x04,0x0a,0x6c]
 
 0x69,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, s105, v2             ; encoding: [0x69,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, s105, v2.l         ; encoding: [0x69,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, s105, v2             ; encoding: [0x69,0x04,0x0a,0x6c]
 
 0x6a,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, vcc_lo, v2           ; encoding: [0x6a,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, vcc_lo, v2.l       ; encoding: [0x6a,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, vcc_lo, v2           ; encoding: [0x6a,0x04,0x0a,0x6c]
 
 0x6b,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, vcc_hi, v2           ; encoding: [0x6b,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, vcc_hi, v2.l       ; encoding: [0x6b,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, vcc_hi, v2           ; encoding: [0x6b,0x04,0x0a,0x6c]
 
 0x7b,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, ttmp15, v2           ; encoding: [0x7b,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, ttmp15, v2.l       ; encoding: [0x7b,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, ttmp15, v2           ; encoding: [0x7b,0x04,0x0a,0x6c]
 
 0x7d,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, m0, v2               ; encoding: [0x7d,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, m0, v2.l           ; encoding: [0x7d,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, m0, v2               ; encoding: [0x7d,0x04,0x0a,0x6c]
 
 0x7e,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, exec_lo, v2          ; encoding: [0x7e,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, exec_lo, v2.l      ; encoding: [0x7e,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, exec_lo, v2          ; encoding: [0x7e,0x04,0x0a,0x6c]
 
 0x7f,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, exec_hi, v2          ; encoding: [0x7f,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, exec_hi, v2.l      ; encoding: [0x7f,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, exec_hi, v2          ; encoding: [0x7f,0x04,0x0a,0x6c]
 
 0x7c,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, null, v2             ; encoding: [0x7c,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, null, v2.l         ; encoding: [0x7c,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, null, v2             ; encoding: [0x7c,0x04,0x0a,0x6c]
 
 0xc1,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, -1, v2               ; encoding: [0xc1,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, -1, v2.l           ; encoding: [0xc1,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, -1, v2               ; encoding: [0xc1,0x04,0x0a,0x6c]
 
 0xf0,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, 0.5, v2              ; encoding: [0xf0,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, 0.5, v2.l          ; encoding: [0xf0,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, 0.5, v2              ; encoding: [0xf0,0x04,0x0a,0x6c]
 
 0xfd,0x04,0x0a,0x6c
-# GFX12: v_fmac_f16_e32 v5, src_scc, v2          ; encoding: [0xfd,0x04,0x0a,0x6c]
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, src_scc, v2.l      ; encoding: [0xfd,0x04,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, src_scc, v2          ; encoding: [0xfd,0x04,0x0a,0x6c]
 
 0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmac_f16_e32 v127, 0xfe0b, v127       ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmac_f16_e32 v127.l, 0xfe0b, v127.l   ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmac_f16_e32 v127, 0xfe0b, v127       ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00]
+
+0x81,0x05,0x0a,0x6c
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, v1.h, v2.l         ; encoding: [0x81,0x05,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x6c]
+
+0xff,0x05,0x0a,0x6c
+# GFX12-REAL16: v_fmac_f16_e32 v5.l, v127.h, v2.l       ; encoding: [0xff,0x05,0x0a,0x6c]
+# GFX12-FAKE16: v_fmac_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x6c]
+
+0xfd,0x04,0x0b,0x6d
+# GFX12-REAL16: v_fmac_f16_e32 v5.h, src_scc, v2.h      ; encoding: [0xfd,0x04,0x0b,0x6d]
+
+0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmac_f16_e32 v127.h, 0xfe0b, v127.h   ; encoding: [0xff,0xfe,0xff,0x6d,0x0b,0xfe,0x00,0x00]
 
 0x01,0x05,0x0a,0x56
 # GFX12: v_fmac_f32_e32 v5, v1, v2               ; encoding: [0x01,0x05,0x0a,0x56]
@@ -626,49 +684,78 @@
 # GFX12: v_fmac_f32_e32 v255, 0xaf123456, v255   ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf]
 
 0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, v1, 0xfe0b, v3          ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, v1.l, 0xfe0b, v3.l    ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, v1, 0xfe0b, v3          ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, v127, 0xfe0b, v3        ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, v127.l, 0xfe0b, v3.l  ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, v127, 0xfe0b, v3        ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, s1, 0xfe0b, v3          ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, s1, 0xfe0b, v3.l      ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, s1, 0xfe0b, v3          ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, s105, 0xfe0b, v3        ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, s105, 0xfe0b, v3.l    ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, s105, 0xfe0b, v3        ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3      ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, vcc_lo, 0xfe0b, v3.l  ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3      ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3      ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, vcc_hi, 0xfe0b, v3.l  ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3      ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3      ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, ttmp15, 0xfe0b, v3.l  ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3      ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, m0, 0xfe0b, v3          ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, m0, 0xfe0b, v3.l      ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, m0, 0xfe0b, v3          ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3     ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, exec_lo, 0xfe0b, v3.l ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3     ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3     ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, exec_hi, 0xfe0b, v3.l ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3     ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, null, 0xfe0b, v3        ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, null, 0xfe0b, v3.l    ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, null, 0xfe0b, v3        ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, -1, 0xfe0b, v3          ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, -1, 0xfe0b, v3.l      ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, -1, 0xfe0b, v3          ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, 0.5, 0xfe0b, v3         ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, 0.5, 0xfe0b, v3.l     ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, 0.5, 0xfe0b, v3         ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v5, src_scc, 0xfe0b, v3     ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v5.l, src_scc, 0xfe0b, v3.l ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, src_scc, 0xfe0b, v3     ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
 
 0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00
-# GFX12: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127  ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fmamk_f16 v127.l, 0xfe0b, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127  ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00]
+
+0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmamk_f16 v5.l, v1.h, 0xfe0b, v3.l    ; encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, 0xfe0b, v3 ; encoding: [0x81,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmamk_f16 v5.l, v127.h, 0xfe0b, v3.l  ; encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fmamk_f16 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, 0xfe0b, v3 ; encoding: [0xff,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00]
+
+0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmamk_f16 v5.h, src_scc, 0xfe0b, v3.h ; encoding: [0xfd,0x06,0x0b,0x6f,0x0b,0xfe,0x00,0x00]
+
+0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fmamk_f16 v127.h, 0xfe0b, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x6f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf
 # GFX12: v_fmamk_f32 v5, v1, 0xaf123456, v3      ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt
index 551fb0d311188..9a9a1eb2cf959 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt
@@ -383,46 +383,68 @@
 # GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01]
 
 0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13
-# GFX12: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13]
 
 0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30
-# GFX12: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30]
+# GFX12-REAL16: v_fmac_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30]
+# GFX12-FAKE16: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30]
+
+0xfa,0x04,0x0b,0x6d,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_fmac_f16_dpp v5.h, v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x6d,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xfd,0x30
+# GFX12-REAL16: v_fmac_f16_dpp v127.h, -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x6d,0xff,0x6f,0xfd,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v126, v255, v183      ; encoding: [0xff,0x6f,0xfd,0x30]
 
 0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff
 # GFX12: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt
index bbf494c153fd3..0ef2c082f7dda 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt
@@ -59,10 +59,19 @@
 # GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00]
 
 0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05
-# GFX12: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fmac_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05]
 
 0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00
-# GFX12: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_fmac_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00]
+
+0xe9,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_fmac_f16_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x6d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_fmac_f16_dpp v127.h, v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x6d,0xff,0x00,0x00,0x00]
 
 0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05
 # GFX12: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
index 9575e50f16312..63e425fdb4ec9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
@@ -1,4 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s
 
 # GFX940: global_load_dword v2, v[2:3], off sc0   ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02]
 0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
new file mode 100644
index 0000000000000..ce37e228f03fa
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
@@ -0,0 +1,44 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s
+
+# GFX950: 	global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
+0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00
+
+# GFX950: 	global_load_lds_dwordx3 v[2:3], off     ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
+0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00
+
+# GFX950: 	global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
+0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00
+
+# GFX950: 	global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
+0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00
+
+# GFX950: 	global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
+0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00
+
+# GFX950: 	global_load_lds_dwordx4 v[2:3], off     ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
+0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00
+
+# GFX950: 	global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
+0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00
+
+# GFX950: 	global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
+0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00
+
+
+# GFX950: 	buffer_load_dwordx3 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03]
+0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03
+
+# GFX950: 	buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03]
+0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03
+
+# GFX950: 	buffer_load_dwordx3 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65]
+0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65
+
+# GFX950: 	buffer_load_dwordx4 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03]
+0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03
+
+# GFX950: 	buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03]
+0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03
+
+# GFX950: 	buffer_load_dwordx4 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65]
+0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt
new file mode 100644
index 0000000000000..336a26907891a
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt
@@ -0,0 +1,151 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s
+
+# GFX950: v_prng_b32_e32 v5, v1                   ; encoding: [0x01,0xb1,0x0a,0x7e]
+0x01,0xb1,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, v255                 ; encoding: [0xff,0xb1,0x0a,0x7e]
+0xff,0xb1,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, s1                   ; encoding: [0x01,0xb0,0x0a,0x7e]
+0x01,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, s101                 ; encoding: [0x65,0xb0,0x0a,0x7e]
+0x65,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, vcc_lo               ; encoding: [0x6a,0xb0,0x0a,0x7e]
+0x6a,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, vcc_hi               ; encoding: [0x6b,0xb0,0x0a,0x7e]
+0x6b,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, ttmp15               ; encoding: [0x7b,0xb0,0x0a,0x7e]
+0x7b,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, m0                   ; encoding: [0x7c,0xb0,0x0a,0x7e]
+0x7c,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, exec_lo              ; encoding: [0x7e,0xb0,0x0a,0x7e]
+0x7e,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, exec_hi              ; encoding: [0x7f,0xb0,0x0a,0x7e]
+0x7f,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, -1                   ; encoding: [0xc1,0xb0,0x0a,0x7e]
+0xc1,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, 0.5                  ; encoding: [0xf0,0xb0,0x0a,0x7e]
+0xf0,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v5, src_scc              ; encoding: [0xfd,0xb0,0x0a,0x7e]
+0xfd,0xb0,0x0a,0x7e
+
+# GFX950: v_prng_b32_e32 v255, 0xaf123456         ; encoding: [0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+0xff,0xb0,0xfe,0x7f,0x56,0x34,0x12,0xaf
+
+# GFX950: v_cvt_f32_bf16_e32 v5, v1               ; encoding: [0x01,0xb7,0x0a,0x7e]
+0x01,0xb7,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, v127             ; encoding: [0x7f,0xb7,0x0a,0x7e]
+0x7f,0xb7,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, s1               ; encoding: [0x01,0xb6,0x0a,0x7e]
+0x01,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo           ; encoding: [0x6a,0xb6,0x0a,0x7e]
+0x6a,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi           ; encoding: [0x6b,0xb6,0x0a,0x7e]
+0x6b,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, ttmp15           ; encoding: [0x7b,0xb6,0x0a,0x7e]
+0x7b,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, m0               ; encoding: [0x7c,0xb6,0x0a,0x7e]
+0x7c,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, exec_lo          ; encoding: [0x7e,0xb6,0x0a,0x7e]
+0x7e,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, exec_hi          ; encoding: [0x7f,0xb6,0x0a,0x7e]
+0x7f,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, -1               ; encoding: [0xc1,0xb6,0x0a,0x7e]
+0xc1,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, 0.5              ; encoding: [0xf0,0xb6,0x0a,0x7e]
+0xf0,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, src_scc          ; encoding: [0xfd,0xb6,0x0a,0x7e]
+0xfd,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v127, 0x8000         ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00
+
+# GFX950: v_cvt_f32_bf16_e64 v5, -v1              ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20]
+0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20
+
+# GFX950: v_cvt_f32_bf16_e64 v5, |v1|             ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00]
+0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00
+
+# GFX950: v_cvt_f32_bf16_e64 v5, -|v1|            ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20]
+0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20
+
+# GFX950: v_cvt_f32_bf16_e64 v5, 0.5 clamp mul:2  ; encoding: [0x05,0x80,0x9b,0xd1,0xf0,0x00,0x00,0x08]
+0x05,0x80,0x9b,0xd1,0xf0,0x00,0x00,0x08
+
+# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2   ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18]
+0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18
+
+# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp div:2   ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18]
+0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x18
+
+# GFX950: v_cvt_f32_bf16_e32 v5, v1               ; encoding: [0x01,0xb7,0x0a,0x7e]
+0x01,0xb7,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, v127             ; encoding: [0x7f,0xb7,0x0a,0x7e]
+0x7f,0xb7,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, s1               ; encoding: [0x01,0xb6,0x0a,0x7e]
+0x01,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, vcc_lo           ; encoding: [0x6a,0xb6,0x0a,0x7e]
+0x6a,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, vcc_hi           ; encoding: [0x6b,0xb6,0x0a,0x7e]
+0x6b,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, ttmp15           ; encoding: [0x7b,0xb6,0x0a,0x7e]
+0x7b,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, m0               ; encoding: [0x7c,0xb6,0x0a,0x7e]
+0x7c,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, exec_lo          ; encoding: [0x7e,0xb6,0x0a,0x7e]
+0x7e,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, exec_hi          ; encoding: [0x7f,0xb6,0x0a,0x7e]
+0x7f,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, -1               ; encoding: [0xc1,0xb6,0x0a,0x7e]
+0xc1,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, 0.5              ; encoding: [0xf0,0xb6,0x0a,0x7e]
+0xf0,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v5, src_scc          ; encoding: [0xfd,0xb6,0x0a,0x7e]
+0xfd,0xb6,0x0a,0x7e
+
+# GFX950: v_cvt_f32_bf16_e32 v127, 0x8000         ; encoding: [0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+0xff,0xb6,0xfe,0x7e,0x00,0x80,0x00,0x00
+
+# GFX950: v_cvt_f32_bf16_e64 v5, -v1              ; encoding: [0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20]
+0x05,0x00,0x9b,0xd1,0x01,0x01,0x00,0x20
+
+# GFX950: v_cvt_f32_bf16_e64 v5, |v1|             ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00]
+0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x00
+
+# GFX950: v_cvt_f32_bf16_e64 v5, -|v1|            ; encoding: [0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20]
+0x05,0x01,0x9b,0xd1,0x01,0x01,0x00,0x20
+
+# GFX950: v_cvt_f32_bf16_e64 v5, v1 clamp mul:2   ; encoding: [0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08]
+0x05,0x80,0x9b,0xd1,0x01,0x01,0x00,0x08
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
new file mode 100644
index 0000000000000..909743c2babf5
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s
+
+# GFX950: v_cvt_pk_bf16_f32 v5, v1, v2            ; encoding: [0x05,0x00,0x68,0xd2,0x01,0x05,0x02,0x00]
+0x05,0x00,0x68,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_pk_bf16_f32 v5, v255, v255        ; encoding: [0x05,0x00,0x68,0xd2,0xff,0xff,0x03,0x00]
+0x05,0x00,0x68,0xd2,0xff,0xff,0x03,0x00
+
+# GFX950: v_cvt_pk_bf16_f32 v5, v1, s2            ; encoding: [0x05,0x00,0x68,0xd2,0x01,0x05,0x00,0x00]
+0x05,0x00,0x68,0xd2,0x01,0x05,0x00,0x00
+
+# GFX950: v_cvt_pk_bf16_f32 v5, m0, 0.5           ; encoding: [0x05,0x00,0x68,0xd2,0x7c,0xe0,0x01,0x00]
+0x05,0x00,0x68,0xd2,0x7c,0xe0,0x01,0x00
+
+# GFX950: v_cvt_pk_bf16_f32 v5, -1, exec_hi       ; encoding: [0x05,0x00,0x68,0xd2,0xc1,0xfe,0x00,0x00]
+0x05,0x00,0x68,0xd2,0xc1,0xfe,0x00,0x00
+
+# GFX950: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2     ; encoding: [0x05,0x00,0x68,0xd2,0xf0,0xf8,0x00,0x08]
+0x05,0x00,0x68,0xd2,0xf0,0xf8,0x00,0x08
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
new file mode 100644
index 0000000000000..1fa48fca80fb4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt
@@ -0,0 +1,161 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c]
+0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13]
+0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13
+
+# GFX950:   v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04]
+0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04
+
+# GFX950:   v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c]
+0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c
+
+# GFX950:   v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b]
+0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b
+
+# GFX950:   v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4]
+0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4
+
+# GFX950:   v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b]
+0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b
+
+# GFX950:   v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c]
+0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c
+
+# GFX950:   v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03]
+0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03
+
+# GFX950:   v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4]
+0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4
+
+# GFX950:   v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b]
+0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c]
+0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c]
+0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03]
+0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4]
+0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4
+
+# GFX950:   v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04]
+0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04
+
+
+# GFX950:   v_mfma_ld_scale_b32 0, 0               ; encoding: [0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18]
+0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 1, 0               ; encoding: [0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18]
+0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 64, 64             ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18]
+0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 64, v0             ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18]
+0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 m0, m0             ; encoding: [0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18]
+0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 s0, s0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18]
+0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 s0, v0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18]
+0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 src_execz, src_execz ; encoding: [0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18]
+0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 src_vccz, src_vccz ; encoding: [0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18]
+0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v0, 64             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18]
+0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v0, s0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18]
+0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v0, v0             ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18]
+0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1             ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18]
+0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18]
+0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10]
+0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08]
+0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18]
+0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10]
+0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08]
+0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18]
+0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00]
+0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10]
+0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10
+
+# GFX950:   v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08]
+0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08
+
+# GFX950:   v_mfma_ld_scale_b32 vcc_lo, vcc_lo     ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18]
+0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18
diff --git a/llvm/test/MC/ELF/relocation-alias.s b/llvm/test/MC/ELF/relocation-alias.s
index 66bf2ceea508b..7701f1107e5a6 100644
--- a/llvm/test/MC/ELF/relocation-alias.s
+++ b/llvm/test/MC/ELF/relocation-alias.s
@@ -17,7 +17,7 @@ movabsq $memcpy+2, %rax
 # CHECK:      movq (%rip), %rax
 # CHECK-NEXT:   R_X86_64_REX_GOTPCRELX  abs-0x4
 # CHECK:      movq (%rip), %r16
-# CHECK-NEXT:   R_X86_64_REX2_GOTPCRELX abs-0x4
+# CHECK-NEXT:   R_X86_64_CODE_4_GOTPCRELX abs-0x4
 movq abs@GOTPCREL(%rip), %rax
 movq abs@GOTPCREL(%rip), %r16
 abs = 42
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
index 1c1c658ad440f..0d81ddd6763b5 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
@@ -91,3 +91,14 @@ amxor.w $a0, $a1, $a0
 amadd.d $a0, $a1, $a2, $a3
 # CHECK: :[[#@LINE+1]]:24: error: optional integer offset must be 0
 amadd.d $a0, $a1, $a2, 1
+
+## According to experiment results on real LA664 HW, the AMCAS instructions
+## are subject to the same constraint as the other 3-register atomic insns.
+## This is undocumented in v1.10 of the LoongArch Reference Manual.
+
+# CHECK: :[[#@LINE+1]]:10: error: $rd must be different from both $rk and $rj
+amcas.b $a0, $a0, $a0
+# CHECK: :[[#@LINE+1]]:10: error: $rd must be different from both $rk and $rj
+amcas.h $a0, $a0, $a1
+# CHECK: :[[#@LINE+1]]:13: error: $rd must be different from both $rk and $rj
+amcas_db.w $a0, $a1, $a0
diff --git a/llvm/test/MC/X86/gotpcrelx.s b/llvm/test/MC/X86/gotpcrelx.s
index 5a8ba454bc904..e88c514b22690 100644
--- a/llvm/test/MC/X86/gotpcrelx.s
+++ b/llvm/test/MC/X86/gotpcrelx.s
@@ -37,16 +37,16 @@
 # CHECK-NEXT:     R_X86_64_REX_GOTPCRELX sbb
 # CHECK-NEXT:     R_X86_64_REX_GOTPCRELX sub
 # CHECK-NEXT:     R_X86_64_REX_GOTPCRELX xor
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX mov
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX test
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX adc
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX add
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX and
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX cmp
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX or
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX sbb
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX sub
-# CHECK-NEXT:     R_X86_64_REX2_GOTPCRELX xor
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX mov
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX test
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX adc
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX add
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX and
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX cmp
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX or
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX sbb
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX sub
+# CHECK-NEXT:     R_X86_64_CODE_4_GOTPCRELX xor
 # CHECK-NEXT:   }
 
 # NORELAX-NEXT:     R_X86_64_GOTPCREL mov
diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s
index 323603efc7061..e0a1a5730597f 100644
--- a/llvm/test/MC/X86/reloc-directive-elf-64.s
+++ b/llvm/test/MC/X86/reloc-directive-elf-64.s
@@ -9,7 +9,7 @@
 # PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2
 # PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3
 # PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5
-# PRINT-NEXT: .reloc 0, R_X86_64_REX2_GOTPCRELX, 7
+# PRINT-NEXT: .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7
 # PRINT:      .reloc 0, BFD_RELOC_NONE, 9
 # PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9
 # PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9
@@ -22,7 +22,7 @@
 # CHECK-NEXT: 0x0 R_X86_64_64 .data 0x2
 # CHECK-NEXT: 0x0 R_X86_64_GOTPCRELX foo 0x3
 # CHECK-NEXT: 0x0 R_X86_64_REX_GOTPCRELX - 0x5
-# CHECK-NEXT: 0x0 R_X86_64_REX2_GOTPCRELX - 0x7
+# CHECK-NEXT: 0x0 R_X86_64_CODE_4_GOTPCRELX - 0x7
 # CHECK-NEXT: 0x0 R_X86_64_NONE - 0x9
 # CHECK-NEXT: 0x0 R_X86_64_8 - 0x9
 # CHECK-NEXT: 0x0 R_X86_64_16 - 0x9
@@ -39,7 +39,7 @@
   .reloc 0, R_X86_64_64, .data+2
   .reloc 0, R_X86_64_GOTPCRELX, foo+3
   .reloc 0, R_X86_64_REX_GOTPCRELX, 5
-  .reloc 0, R_X86_64_REX2_GOTPCRELX, 7
+  .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7
 
   .reloc 0, BFD_RELOC_NONE, 9
   .reloc 0, BFD_RELOC_8, 9
diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 9c79ea588f624..416419b3a333f 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -162,6 +162,10 @@
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX942 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX942 %s
 
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX950/' %s | yaml2obj -o %t.o.AMDGCN_GFX950
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX950 %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX950 %s
+
 # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1010/' %s | yaml2obj -o %t.o.AMDGCN_GFX1010
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1010 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1010 %s
@@ -411,6 +415,9 @@
 # ELF-AMDGCN-GFX942:    EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
 # YAML-AMDGCN-GFX942:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX942 ]
 
+# ELF-AMDGCN-GFX950:    EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
+# YAML-AMDGCN-GFX950:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX950 ]
+
 # ELF-AMDGCN-GFX1010:   EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
 # YAML-AMDGCN-GFX1010:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1010 ]
 
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 21f3c8593a710..621e9e0abeb28 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -487,19 +487,19 @@ static const X86FoldTableEntry Table0[] = {
   {X86::VCVTPS2PHZ256rr, X86::VCVTPS2PHZ256mr, TB_FOLDED_STORE},
   {X86::VCVTPS2PHZrr, X86::VCVTPS2PHZmr, TB_FOLDED_STORE},
   {X86::VEXTRACTF128rri, X86::VEXTRACTF128mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF32x4Z256rri, X86::VEXTRACTF32x4Z256mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF32x4Zrri, X86::VEXTRACTF32x4Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF32x8Zrri, X86::VEXTRACTF32x8Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF64x2Z256rri, X86::VEXTRACTF64x2Z256mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF64x2Zrri, X86::VEXTRACTF64x2Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTF64x4Zrri, X86::VEXTRACTF64x4Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF32X4Z256rri, X86::VEXTRACTF32X4Z256mri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF32X4Zrri, X86::VEXTRACTF32X4Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF32X8Zrri, X86::VEXTRACTF32X8Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF64X2Z256rri, X86::VEXTRACTF64X2Z256mri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF64X2Zrri, X86::VEXTRACTF64X2Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTF64X4Zrri, X86::VEXTRACTF64X4Zmri, TB_FOLDED_STORE},
   {X86::VEXTRACTI128rri, X86::VEXTRACTI128mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI32x4Z256rri, X86::VEXTRACTI32x4Z256mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI32x4Zrri, X86::VEXTRACTI32x4Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI32x8Zrri, X86::VEXTRACTI32x8Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI64x2Z256rri, X86::VEXTRACTI64x2Z256mri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI64x2Zrri, X86::VEXTRACTI64x2Zmri, TB_FOLDED_STORE},
-  {X86::VEXTRACTI64x4Zrri, X86::VEXTRACTI64x4Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI32X4Z256rri, X86::VEXTRACTI32X4Z256mri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI32X4Zrri, X86::VEXTRACTI32X4Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI32X8Zrri, X86::VEXTRACTI32X8Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI64X2Z256rri, X86::VEXTRACTI64X2Z256mri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI64X2Zrri, X86::VEXTRACTI64X2Zmri, TB_FOLDED_STORE},
+  {X86::VEXTRACTI64X4Zrri, X86::VEXTRACTI64X4Zmri, TB_FOLDED_STORE},
   {X86::VEXTRACTPSZrri, X86::VEXTRACTPSZmri, TB_FOLDED_STORE},
   {X86::VEXTRACTPSrri, X86::VEXTRACTPSmri, TB_FOLDED_STORE},
   {X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE|TB_NO_REVERSE},
@@ -1506,21 +1506,21 @@ static const X86FoldTableEntry Table1[] = {
   {X86::VEXPANDPSZ128rr, X86::VEXPANDPSZ128rm, TB_NO_REVERSE},
   {X86::VEXPANDPSZ256rr, X86::VEXPANDPSZ256rm, TB_NO_REVERSE},
   {X86::VEXPANDPSZrr, X86::VEXPANDPSZrm, TB_NO_REVERSE},
-  {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rm, 0},
-  {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rm, 0},
-  {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrm, 0},
-  {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0},
-  {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0},
-  {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0},
-  {X86::VFPCLASSPHZ128rr, X86::VFPCLASSPHZ128rm, 0},
-  {X86::VFPCLASSPHZ256rr, X86::VFPCLASSPHZ256rm, 0},
-  {X86::VFPCLASSPHZrr, X86::VFPCLASSPHZrm, 0},
-  {X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0},
-  {X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0},
-  {X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0},
-  {X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE},
-  {X86::VFPCLASSSHZrr, X86::VFPCLASSSHZrm, TB_NO_REVERSE},
-  {X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE},
+  {X86::VFPCLASSPBF16Z128ri, X86::VFPCLASSPBF16Z128mi, 0},
+  {X86::VFPCLASSPBF16Z256ri, X86::VFPCLASSPBF16Z256mi, 0},
+  {X86::VFPCLASSPBF16Zri, X86::VFPCLASSPBF16Zmi, 0},
+  {X86::VFPCLASSPDZ128ri, X86::VFPCLASSPDZ128mi, 0},
+  {X86::VFPCLASSPDZ256ri, X86::VFPCLASSPDZ256mi, 0},
+  {X86::VFPCLASSPDZri, X86::VFPCLASSPDZmi, 0},
+  {X86::VFPCLASSPHZ128ri, X86::VFPCLASSPHZ128mi, 0},
+  {X86::VFPCLASSPHZ256ri, X86::VFPCLASSPHZ256mi, 0},
+  {X86::VFPCLASSPHZri, X86::VFPCLASSPHZmi, 0},
+  {X86::VFPCLASSPSZ128ri, X86::VFPCLASSPSZ128mi, 0},
+  {X86::VFPCLASSPSZ256ri, X86::VFPCLASSPSZ256mi, 0},
+  {X86::VFPCLASSPSZri, X86::VFPCLASSPSZmi, 0},
+  {X86::VFPCLASSSDZri, X86::VFPCLASSSDZmi, TB_NO_REVERSE},
+  {X86::VFPCLASSSHZri, X86::VFPCLASSSHZmi, TB_NO_REVERSE},
+  {X86::VFPCLASSSSZri, X86::VFPCLASSSSZmi, TB_NO_REVERSE},
   {X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0},
   {X86::VFRCZPDrr, X86::VFRCZPDrm, 0},
   {X86::VFRCZPSYrr, X86::VFRCZPSYrm, 0},
@@ -2929,21 +2929,21 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE},
   {X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0},
   {X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE},
-  {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmk, 0},
-  {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmk, 0},
-  {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmk, 0},
-  {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0},
-  {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0},
-  {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0},
-  {X86::VFPCLASSPHZ128rrk, X86::VFPCLASSPHZ128rmk, 0},
-  {X86::VFPCLASSPHZ256rrk, X86::VFPCLASSPHZ256rmk, 0},
-  {X86::VFPCLASSPHZrrk, X86::VFPCLASSPHZrmk, 0},
-  {X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0},
-  {X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0},
-  {X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0},
-  {X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE},
-  {X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE},
-  {X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE},
+  {X86::VFPCLASSPBF16Z128rik, X86::VFPCLASSPBF16Z128mik, 0},
+  {X86::VFPCLASSPBF16Z256rik, X86::VFPCLASSPBF16Z256mik, 0},
+  {X86::VFPCLASSPBF16Zrik, X86::VFPCLASSPBF16Zmik, 0},
+  {X86::VFPCLASSPDZ128rik, X86::VFPCLASSPDZ128mik, 0},
+  {X86::VFPCLASSPDZ256rik, X86::VFPCLASSPDZ256mik, 0},
+  {X86::VFPCLASSPDZrik, X86::VFPCLASSPDZmik, 0},
+  {X86::VFPCLASSPHZ128rik, X86::VFPCLASSPHZ128mik, 0},
+  {X86::VFPCLASSPHZ256rik, X86::VFPCLASSPHZ256mik, 0},
+  {X86::VFPCLASSPHZrik, X86::VFPCLASSPHZmik, 0},
+  {X86::VFPCLASSPSZ128rik, X86::VFPCLASSPSZ128mik, 0},
+  {X86::VFPCLASSPSZ256rik, X86::VFPCLASSPSZ256mik, 0},
+  {X86::VFPCLASSPSZrik, X86::VFPCLASSPSZmik, 0},
+  {X86::VFPCLASSSDZrik, X86::VFPCLASSSDZmik, TB_NO_REVERSE},
+  {X86::VFPCLASSSHZrik, X86::VFPCLASSSHZmik, TB_NO_REVERSE},
+  {X86::VFPCLASSSSZrik, X86::VFPCLASSSSZmik, TB_NO_REVERSE},
   {X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mkz, 0},
   {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mkz, 0},
   {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmkz, 0},
@@ -2998,19 +2998,19 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0},
   {X86::VHSUBPSrr, X86::VHSUBPSrm, 0},
   {X86::VINSERTF128rri, X86::VINSERTF128rmi, 0},
-  {X86::VINSERTF32x4Z256rri, X86::VINSERTF32x4Z256rmi, 0},
-  {X86::VINSERTF32x4Zrri, X86::VINSERTF32x4Zrmi, 0},
-  {X86::VINSERTF32x8Zrri, X86::VINSERTF32x8Zrmi, 0},
-  {X86::VINSERTF64x2Z256rri, X86::VINSERTF64x2Z256rmi, 0},
-  {X86::VINSERTF64x2Zrri, X86::VINSERTF64x2Zrmi, 0},
-  {X86::VINSERTF64x4Zrri, X86::VINSERTF64x4Zrmi, 0},
+  {X86::VINSERTF32X4Z256rri, X86::VINSERTF32X4Z256rmi, 0},
+  {X86::VINSERTF32X4Zrri, X86::VINSERTF32X4Zrmi, 0},
+  {X86::VINSERTF32X8Zrri, X86::VINSERTF32X8Zrmi, 0},
+  {X86::VINSERTF64X2Z256rri, X86::VINSERTF64X2Z256rmi, 0},
+  {X86::VINSERTF64X2Zrri, X86::VINSERTF64X2Zrmi, 0},
+  {X86::VINSERTF64X4Zrri, X86::VINSERTF64X4Zrmi, 0},
   {X86::VINSERTI128rri, X86::VINSERTI128rmi, 0},
-  {X86::VINSERTI32x4Z256rri, X86::VINSERTI32x4Z256rmi, 0},
-  {X86::VINSERTI32x4Zrri, X86::VINSERTI32x4Zrmi, 0},
-  {X86::VINSERTI32x8Zrri, X86::VINSERTI32x8Zrmi, 0},
-  {X86::VINSERTI64x2Z256rri, X86::VINSERTI64x2Z256rmi, 0},
-  {X86::VINSERTI64x2Zrri, X86::VINSERTI64x2Zrmi, 0},
-  {X86::VINSERTI64x4Zrri, X86::VINSERTI64x4Zrmi, 0},
+  {X86::VINSERTI32X4Z256rri, X86::VINSERTI32X4Z256rmi, 0},
+  {X86::VINSERTI32X4Zrri, X86::VINSERTI32X4Zrmi, 0},
+  {X86::VINSERTI32X8Zrri, X86::VINSERTI32X8Zrmi, 0},
+  {X86::VINSERTI64X2Z256rri, X86::VINSERTI64X2Z256rmi, 0},
+  {X86::VINSERTI64X2Zrri, X86::VINSERTI64X2Zrmi, 0},
+  {X86::VINSERTI64X4Zrri, X86::VINSERTI64X4Zrmi, 0},
   {X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0},
   {X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0},
   {X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0},
@@ -5074,18 +5074,18 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VGF2P8MULBZ128rrkz, X86::VGF2P8MULBZ128rmkz, 0},
   {X86::VGF2P8MULBZ256rrkz, X86::VGF2P8MULBZ256rmkz, 0},
   {X86::VGF2P8MULBZrrkz, X86::VGF2P8MULBZrmkz, 0},
-  {X86::VINSERTF32x4Z256rrikz, X86::VINSERTF32x4Z256rmikz, 0},
-  {X86::VINSERTF32x4Zrrikz, X86::VINSERTF32x4Zrmikz, 0},
-  {X86::VINSERTF32x8Zrrikz, X86::VINSERTF32x8Zrmikz, 0},
-  {X86::VINSERTF64x2Z256rrikz, X86::VINSERTF64x2Z256rmikz, 0},
-  {X86::VINSERTF64x2Zrrikz, X86::VINSERTF64x2Zrmikz, 0},
-  {X86::VINSERTF64x4Zrrikz, X86::VINSERTF64x4Zrmikz, 0},
-  {X86::VINSERTI32x4Z256rrikz, X86::VINSERTI32x4Z256rmikz, 0},
-  {X86::VINSERTI32x4Zrrikz, X86::VINSERTI32x4Zrmikz, 0},
-  {X86::VINSERTI32x8Zrrikz, X86::VINSERTI32x8Zrmikz, 0},
-  {X86::VINSERTI64x2Z256rrikz, X86::VINSERTI64x2Z256rmikz, 0},
-  {X86::VINSERTI64x2Zrrikz, X86::VINSERTI64x2Zrmikz, 0},
-  {X86::VINSERTI64x4Zrrikz, X86::VINSERTI64x4Zrmikz, 0},
+  {X86::VINSERTF32X4Z256rrikz, X86::VINSERTF32X4Z256rmikz, 0},
+  {X86::VINSERTF32X4Zrrikz, X86::VINSERTF32X4Zrmikz, 0},
+  {X86::VINSERTF32X8Zrrikz, X86::VINSERTF32X8Zrmikz, 0},
+  {X86::VINSERTF64X2Z256rrikz, X86::VINSERTF64X2Z256rmikz, 0},
+  {X86::VINSERTF64X2Zrrikz, X86::VINSERTF64X2Zrmikz, 0},
+  {X86::VINSERTF64X4Zrrikz, X86::VINSERTF64X4Zrmikz, 0},
+  {X86::VINSERTI32X4Z256rrikz, X86::VINSERTI32X4Z256rmikz, 0},
+  {X86::VINSERTI32X4Zrrikz, X86::VINSERTI32X4Zrmikz, 0},
+  {X86::VINSERTI32X8Zrrikz, X86::VINSERTI32X8Zrmikz, 0},
+  {X86::VINSERTI64X2Z256rrikz, X86::VINSERTI64X2Z256rmikz, 0},
+  {X86::VINSERTI64X2Zrrikz, X86::VINSERTI64X2Zrmikz, 0},
+  {X86::VINSERTI64X4Zrrikz, X86::VINSERTI64X4Zrmikz, 0},
   {X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0},
   {X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0},
   {X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0},
@@ -6696,18 +6696,18 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VGF2P8MULBZ128rrk, X86::VGF2P8MULBZ128rmk, 0},
   {X86::VGF2P8MULBZ256rrk, X86::VGF2P8MULBZ256rmk, 0},
   {X86::VGF2P8MULBZrrk, X86::VGF2P8MULBZrmk, 0},
-  {X86::VINSERTF32x4Z256rrik, X86::VINSERTF32x4Z256rmik, 0},
-  {X86::VINSERTF32x4Zrrik, X86::VINSERTF32x4Zrmik, 0},
-  {X86::VINSERTF32x8Zrrik, X86::VINSERTF32x8Zrmik, 0},
-  {X86::VINSERTF64x2Z256rrik, X86::VINSERTF64x2Z256rmik, 0},
-  {X86::VINSERTF64x2Zrrik, X86::VINSERTF64x2Zrmik, 0},
-  {X86::VINSERTF64x4Zrrik, X86::VINSERTF64x4Zrmik, 0},
-  {X86::VINSERTI32x4Z256rrik, X86::VINSERTI32x4Z256rmik, 0},
-  {X86::VINSERTI32x4Zrrik, X86::VINSERTI32x4Zrmik, 0},
-  {X86::VINSERTI32x8Zrrik, X86::VINSERTI32x8Zrmik, 0},
-  {X86::VINSERTI64x2Z256rrik, X86::VINSERTI64x2Z256rmik, 0},
-  {X86::VINSERTI64x2Zrrik, X86::VINSERTI64x2Zrmik, 0},
-  {X86::VINSERTI64x4Zrrik, X86::VINSERTI64x4Zrmik, 0},
+  {X86::VINSERTF32X4Z256rrik, X86::VINSERTF32X4Z256rmik, 0},
+  {X86::VINSERTF32X4Zrrik, X86::VINSERTF32X4Zrmik, 0},
+  {X86::VINSERTF32X8Zrrik, X86::VINSERTF32X8Zrmik, 0},
+  {X86::VINSERTF64X2Z256rrik, X86::VINSERTF64X2Z256rmik, 0},
+  {X86::VINSERTF64X2Zrrik, X86::VINSERTF64X2Zrmik, 0},
+  {X86::VINSERTF64X4Zrrik, X86::VINSERTF64X4Zrmik, 0},
+  {X86::VINSERTI32X4Z256rrik, X86::VINSERTI32X4Z256rmik, 0},
+  {X86::VINSERTI32X4Zrrik, X86::VINSERTI32X4Zrmik, 0},
+  {X86::VINSERTI32X8Zrrik, X86::VINSERTI32X8Zrmik, 0},
+  {X86::VINSERTI64X2Z256rrik, X86::VINSERTI64X2Z256rmik, 0},
+  {X86::VINSERTI64X2Zrrik, X86::VINSERTI64X2Zrmik, 0},
+  {X86::VINSERTI64X4Zrrik, X86::VINSERTI64X4Zrmik, 0},
   {X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0},
   {X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0},
   {X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0},
@@ -7641,18 +7641,18 @@ static const X86FoldTableEntry BroadcastTable1[] = {
   {X86::VCVTW2PHZrr, X86::VCVTW2PHZrmb, TB_BCAST_W},
   {X86::VEXP2PDZr, X86::VEXP2PDZmb, TB_BCAST_SD},
   {X86::VEXP2PSZr, X86::VEXP2PSZmb, TB_BCAST_SS},
-  {X86::VFPCLASSPBF16Z128rr, X86::VFPCLASSPBF16Z128rmb, TB_BCAST_SH},
-  {X86::VFPCLASSPBF16Z256rr, X86::VFPCLASSPBF16Z256rmb, TB_BCAST_SH},
-  {X86::VFPCLASSPBF16Zrr, X86::VFPCLASSPBF16Zrmb, TB_BCAST_SH},
-  {X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rmb, TB_BCAST_SD},
-  {X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rmb, TB_BCAST_SD},
-  {X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrmb, TB_BCAST_SD},
-  {X86::VFPCLASSPHZ128rr, X86::VFPCLASSPHZ128rmb, TB_BCAST_SH},
-  {X86::VFPCLASSPHZ256rr, X86::VFPCLASSPHZ256rmb, TB_BCAST_SH},
-  {X86::VFPCLASSPHZrr, X86::VFPCLASSPHZrmb, TB_BCAST_SH},
-  {X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rmb, TB_BCAST_SS},
-  {X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rmb, TB_BCAST_SS},
-  {X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrmb, TB_BCAST_SS},
+  {X86::VFPCLASSPBF16Z128ri, X86::VFPCLASSPBF16Z128mbi, TB_BCAST_SH},
+  {X86::VFPCLASSPBF16Z256ri, X86::VFPCLASSPBF16Z256mbi, TB_BCAST_SH},
+  {X86::VFPCLASSPBF16Zri, X86::VFPCLASSPBF16Zmbi, TB_BCAST_SH},
+  {X86::VFPCLASSPDZ128ri, X86::VFPCLASSPDZ128mbi, TB_BCAST_SD},
+  {X86::VFPCLASSPDZ256ri, X86::VFPCLASSPDZ256mbi, TB_BCAST_SD},
+  {X86::VFPCLASSPDZri, X86::VFPCLASSPDZmbi, TB_BCAST_SD},
+  {X86::VFPCLASSPHZ128ri, X86::VFPCLASSPHZ128mbi, TB_BCAST_SH},
+  {X86::VFPCLASSPHZ256ri, X86::VFPCLASSPHZ256mbi, TB_BCAST_SH},
+  {X86::VFPCLASSPHZri, X86::VFPCLASSPHZmbi, TB_BCAST_SH},
+  {X86::VFPCLASSPSZ128ri, X86::VFPCLASSPSZ128mbi, TB_BCAST_SS},
+  {X86::VFPCLASSPSZ256ri, X86::VFPCLASSPSZ256mbi, TB_BCAST_SS},
+  {X86::VFPCLASSPSZri, X86::VFPCLASSPSZmbi, TB_BCAST_SS},
   {X86::VGETEXPPBF16Z128r, X86::VGETEXPPBF16Z128mb, TB_BCAST_SH},
   {X86::VGETEXPPBF16Z256r, X86::VGETEXPPBF16Z256mb, TB_BCAST_SH},
   {X86::VGETEXPPBF16Zr, X86::VGETEXPPBF16Zmb, TB_BCAST_SH},
@@ -8128,18 +8128,18 @@ static const X86FoldTableEntry BroadcastTable2[] = {
   {X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rmb, TB_BCAST_SS},
   {X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rmb, TB_BCAST_SS},
   {X86::VFMULCPHZrr, X86::VFMULCPHZrmb, TB_BCAST_SS},
-  {X86::VFPCLASSPBF16Z128rrk, X86::VFPCLASSPBF16Z128rmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPBF16Z256rrk, X86::VFPCLASSPBF16Z256rmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPBF16Zrrk, X86::VFPCLASSPBF16Zrmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmbk, TB_BCAST_SD},
-  {X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmbk, TB_BCAST_SD},
-  {X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmbk, TB_BCAST_SD},
-  {X86::VFPCLASSPHZ128rrk, X86::VFPCLASSPHZ128rmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPHZ256rrk, X86::VFPCLASSPHZ256rmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPHZrrk, X86::VFPCLASSPHZrmbk, TB_BCAST_SH},
-  {X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmbk, TB_BCAST_SS},
-  {X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmbk, TB_BCAST_SS},
-  {X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmbk, TB_BCAST_SS},
+  {X86::VFPCLASSPBF16Z128rik, X86::VFPCLASSPBF16Z128mbik, TB_BCAST_SH},
+  {X86::VFPCLASSPBF16Z256rik, X86::VFPCLASSPBF16Z256mbik, TB_BCAST_SH},
+  {X86::VFPCLASSPBF16Zrik, X86::VFPCLASSPBF16Zmbik, TB_BCAST_SH},
+  {X86::VFPCLASSPDZ128rik, X86::VFPCLASSPDZ128mbik, TB_BCAST_SD},
+  {X86::VFPCLASSPDZ256rik, X86::VFPCLASSPDZ256mbik, TB_BCAST_SD},
+  {X86::VFPCLASSPDZrik, X86::VFPCLASSPDZmbik, TB_BCAST_SD},
+  {X86::VFPCLASSPHZ128rik, X86::VFPCLASSPHZ128mbik, TB_BCAST_SH},
+  {X86::VFPCLASSPHZ256rik, X86::VFPCLASSPHZ256mbik, TB_BCAST_SH},
+  {X86::VFPCLASSPHZrik, X86::VFPCLASSPHZmbik, TB_BCAST_SH},
+  {X86::VFPCLASSPSZ128rik, X86::VFPCLASSPSZ128mbik, TB_BCAST_SS},
+  {X86::VFPCLASSPSZ256rik, X86::VFPCLASSPSZ256mbik, TB_BCAST_SS},
+  {X86::VFPCLASSPSZrik, X86::VFPCLASSPSZmbik, TB_BCAST_SS},
   {X86::VGETEXPPBF16Z128rkz, X86::VGETEXPPBF16Z128mbkz, TB_BCAST_SH},
   {X86::VGETEXPPBF16Z256rkz, X86::VGETEXPPBF16Z256mbkz, TB_BCAST_SH},
   {X86::VGETEXPPBF16Zrkz, X86::VGETEXPPBF16Zmbkz, TB_BCAST_SH},
@@ -8284,9 +8284,9 @@ static const X86FoldTableEntry BroadcastTable2[] = {
   {X86::VPBLENDMQZ128rr, X86::VPBLENDMQZ128rmb, TB_BCAST_Q},
   {X86::VPBLENDMQZ256rr, X86::VPBLENDMQZ256rmb, TB_BCAST_Q},
   {X86::VPBLENDMQZrr, X86::VPBLENDMQZrmb, TB_BCAST_Q},
-  {X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D},
-  {X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D},
-  {X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D},
+  {X86::VPCMPDZ128rri, X86::VPCMPDZ128rmbi, TB_BCAST_D},
+  {X86::VPCMPDZ256rri, X86::VPCMPDZ256rmbi, TB_BCAST_D},
+  {X86::VPCMPDZrri, X86::VPCMPDZrmbi, TB_BCAST_D},
   {X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D},
   {X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D},
   {X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D},
@@ -8299,15 +8299,15 @@ static const X86FoldTableEntry BroadcastTable2[] = {
   {X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q},
   {X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q},
   {X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q},
-  {X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q},
-  {X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q},
-  {X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q},
-  {X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D},
-  {X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D},
-  {X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D},
-  {X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q},
-  {X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q},
-  {X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q},
+  {X86::VPCMPQZ128rri, X86::VPCMPQZ128rmbi, TB_BCAST_Q},
+  {X86::VPCMPQZ256rri, X86::VPCMPQZ256rmbi, TB_BCAST_Q},
+  {X86::VPCMPQZrri, X86::VPCMPQZrmbi, TB_BCAST_Q},
+  {X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmbi, TB_BCAST_D},
+  {X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmbi, TB_BCAST_D},
+  {X86::VPCMPUDZrri, X86::VPCMPUDZrmbi, TB_BCAST_D},
+  {X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmbi, TB_BCAST_Q},
+  {X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmbi, TB_BCAST_Q},
+  {X86::VPCMPUQZrri, X86::VPCMPUQZrmbi, TB_BCAST_Q},
   {X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmbkz, TB_BCAST_D},
   {X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmbkz, TB_BCAST_D},
   {X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmbkz, TB_BCAST_D},
@@ -9306,9 +9306,9 @@ static const X86FoldTableEntry BroadcastTable3[] = {
   {X86::VPBLENDMQZ128rrk, X86::VPBLENDMQZ128rmbk, TB_BCAST_Q},
   {X86::VPBLENDMQZ256rrk, X86::VPBLENDMQZ256rmbk, TB_BCAST_Q},
   {X86::VPBLENDMQZrrk, X86::VPBLENDMQZrmbk, TB_BCAST_Q},
-  {X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmibk, TB_BCAST_D},
-  {X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmibk, TB_BCAST_D},
-  {X86::VPCMPDZrrik, X86::VPCMPDZrmibk, TB_BCAST_D},
+  {X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmbik, TB_BCAST_D},
+  {X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmbik, TB_BCAST_D},
+  {X86::VPCMPDZrrik, X86::VPCMPDZrmbik, TB_BCAST_D},
   {X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmbk, TB_BCAST_D},
   {X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmbk, TB_BCAST_D},
   {X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmbk, TB_BCAST_D},
@@ -9321,15 +9321,15 @@ static const X86FoldTableEntry BroadcastTable3[] = {
   {X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmbk, TB_BCAST_Q},
   {X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmbk, TB_BCAST_Q},
   {X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmbk, TB_BCAST_Q},
-  {X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmibk, TB_BCAST_Q},
-  {X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmibk, TB_BCAST_Q},
-  {X86::VPCMPQZrrik, X86::VPCMPQZrmibk, TB_BCAST_Q},
-  {X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmibk, TB_BCAST_D},
-  {X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmibk, TB_BCAST_D},
-  {X86::VPCMPUDZrrik, X86::VPCMPUDZrmibk, TB_BCAST_D},
-  {X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmibk, TB_BCAST_Q},
-  {X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmibk, TB_BCAST_Q},
-  {X86::VPCMPUQZrrik, X86::VPCMPUQZrmibk, TB_BCAST_Q},
+  {X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmbik, TB_BCAST_Q},
+  {X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmbik, TB_BCAST_Q},
+  {X86::VPCMPQZrrik, X86::VPCMPQZrmbik, TB_BCAST_Q},
+  {X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmbik, TB_BCAST_D},
+  {X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmbik, TB_BCAST_D},
+  {X86::VPCMPUDZrrik, X86::VPCMPUDZrmbik, TB_BCAST_D},
+  {X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmbik, TB_BCAST_Q},
+  {X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmbik, TB_BCAST_Q},
+  {X86::VPCMPUQZrrik, X86::VPCMPUQZrmbik, TB_BCAST_Q},
   {X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmbk, TB_BCAST_D},
   {X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmbk, TB_BCAST_D},
   {X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmbk, TB_BCAST_D},
diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc
index 256686612c6a2..b972427c2ff7a 100644
--- a/llvm/test/TableGen/x86-instr-mapping.inc
+++ b/llvm/test/TableGen/x86-instr-mapping.inc
@@ -360,14 +360,14 @@ static const X86TableEntry X86CompressEVEXTable[] = {
   { X86::VDIVSSZrm_Int, X86::VDIVSSrm_Int },
   { X86::VDIVSSZrr, X86::VDIVSSrr },
   { X86::VDIVSSZrr_Int, X86::VDIVSSrr_Int },
-  { X86::VEXTRACTF32x4Z256mri, X86::VEXTRACTF128mri },
-  { X86::VEXTRACTF32x4Z256rri, X86::VEXTRACTF128rri },
-  { X86::VEXTRACTF64x2Z256mri, X86::VEXTRACTF128mri },
-  { X86::VEXTRACTF64x2Z256rri, X86::VEXTRACTF128rri },
-  { X86::VEXTRACTI32x4Z256mri, X86::VEXTRACTI128mri },
-  { X86::VEXTRACTI32x4Z256rri, X86::VEXTRACTI128rri },
-  { X86::VEXTRACTI64x2Z256mri, X86::VEXTRACTI128mri },
-  { X86::VEXTRACTI64x2Z256rri, X86::VEXTRACTI128rri },
+  { X86::VEXTRACTF32X4Z256mri, X86::VEXTRACTF128mri },
+  { X86::VEXTRACTF32X4Z256rri, X86::VEXTRACTF128rri },
+  { X86::VEXTRACTF64X2Z256mri, X86::VEXTRACTF128mri },
+  { X86::VEXTRACTF64X2Z256rri, X86::VEXTRACTF128rri },
+  { X86::VEXTRACTI32X4Z256mri, X86::VEXTRACTI128mri },
+  { X86::VEXTRACTI32X4Z256rri, X86::VEXTRACTI128rri },
+  { X86::VEXTRACTI64X2Z256mri, X86::VEXTRACTI128mri },
+  { X86::VEXTRACTI64X2Z256rri, X86::VEXTRACTI128rri },
   { X86::VEXTRACTPSZmri, X86::VEXTRACTPSmri },
   { X86::VEXTRACTPSZrri, X86::VEXTRACTPSrri },
   { X86::VFMADD132PDZ128m, X86::VFMADD132PDm },
@@ -622,14 +622,14 @@ static const X86TableEntry X86CompressEVEXTable[] = {
   { X86::VGF2P8MULBZ128rr, X86::VGF2P8MULBrr },
   { X86::VGF2P8MULBZ256rm, X86::VGF2P8MULBYrm },
   { X86::VGF2P8MULBZ256rr, X86::VGF2P8MULBYrr },
-  { X86::VINSERTF32x4Z256rmi, X86::VINSERTF128rmi },
-  { X86::VINSERTF32x4Z256rri, X86::VINSERTF128rri },
-  { X86::VINSERTF64x2Z256rmi, X86::VINSERTF128rmi },
-  { X86::VINSERTF64x2Z256rri, X86::VINSERTF128rri },
-  { X86::VINSERTI32x4Z256rmi, X86::VINSERTI128rmi },
-  { X86::VINSERTI32x4Z256rri, X86::VINSERTI128rri },
-  { X86::VINSERTI64x2Z256rmi, X86::VINSERTI128rmi },
-  { X86::VINSERTI64x2Z256rri, X86::VINSERTI128rri },
+  { X86::VINSERTF32X4Z256rmi, X86::VINSERTF128rmi },
+  { X86::VINSERTF32X4Z256rri, X86::VINSERTF128rri },
+  { X86::VINSERTF64X2Z256rmi, X86::VINSERTF128rmi },
+  { X86::VINSERTF64X2Z256rri, X86::VINSERTF128rri },
+  { X86::VINSERTI32X4Z256rmi, X86::VINSERTI128rmi },
+  { X86::VINSERTI32X4Z256rri, X86::VINSERTI128rri },
+  { X86::VINSERTI64X2Z256rmi, X86::VINSERTI128rmi },
+  { X86::VINSERTI64X2Z256rri, X86::VINSERTI128rri },
   { X86::VINSERTPSZrmi, X86::VINSERTPSrmi },
   { X86::VINSERTPSZrri, X86::VINSERTPSrri },
   { X86::VMAXCPDZ128rm, X86::VMAXCPDrm },
diff --git a/llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc b/llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc
new file mode 100644
index 0000000000000..78b134cb44216
Binary files /dev/null and b/llvm/test/ThinLTO/X86/Inputs/memprof-old-stackid-summary.bc differ
diff --git a/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll b/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll
new file mode 100644
index 0000000000000..10048f8674a08
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-old-stackid-summary.ll
@@ -0,0 +1,20 @@
+;; Check that we can read the old STACK_ID summary format that encoded the id as
+;; a VBR8 instead of as a pair of 32-bit fixed-width values.
+;;
+;; The old bitcode was generated by the older compiler from `opt -thinlto-bc`
+;; on the following LLVM assembly:
+;;
+;; target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+;; target triple = "x86_64-unknown-linux-gnu"
+;;
+;; define void @bar() {
+;;   call void @foo(), !callsite !0
+;;   ret void
+;; }
+;;
+;; declare void @foo()
+;;
+;; !0 = !{i64 9086428284934609951}
+
+; RUN: llvm-dis %S/Inputs/memprof-old-stackid-summary.bc -o - | FileCheck %s
+; CHECK: stackIds: (9086428284934609951)
diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll
index 46d1bb737a693..b5dd3867bdbc2 100644
--- a/llvm/test/Transforms/GVN/tbaa.ll
+++ b/llvm/test/Transforms/GVN/tbaa.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=gvn -S < %s | FileCheck %s
 
 define i32 @test1(ptr %p, ptr %q) {
-; CHECK-LABEL: @test1(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p)
   %c = add i32 %a, %b
@@ -12,9 +15,12 @@ define i32 @test1(ptr %p, ptr %q) {
 }
 
 define i32 @test2(ptr %p, ptr %q) {
-; CHECK-LABEL: @test2(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGC:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -22,9 +28,12 @@ define i32 @test2(ptr %p, ptr %q) {
 }
 
 define i32 @test3(ptr %p, ptr %q) {
-; CHECK-LABEL: @test3(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGB:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test3(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !3
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -32,9 +41,12 @@ define i32 @test3(ptr %p, ptr %q) {
 }
 
 define i32 @test4(ptr %p, ptr %q) {
-; CHECK-LABEL: @test4(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test4(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !1
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -42,9 +54,12 @@ define i32 @test4(ptr %p, ptr %q) {
 }
 
 define i32 @test5(ptr %p, ptr %q) {
-; CHECK-LABEL: @test5(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test5(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !1
   %c = add i32 %a, %b
@@ -52,9 +67,12 @@ define i32 @test5(ptr %p, ptr %q) {
 }
 
 define i32 @test6(ptr %p, ptr %q) {
-; CHECK-LABEL: @test6(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test6(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -62,10 +80,12 @@ define i32 @test6(ptr %p, ptr %q) {
 }
 
 define i32 @test7(ptr %p, ptr %q) {
-; CHECK-LABEL: @test7(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test7(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !4
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -73,9 +93,11 @@ define i32 @test7(ptr %p, ptr %q) {
 }
 
 define i32 @test8(ptr %p, ptr %q) {
-; CHECK-LABEL: @test8
-; CHECK-NEXT: store i32 15, ptr %p
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test8(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    store i32 15, ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store.
 
@@ -87,9 +109,11 @@ define i32 @test8(ptr %p, ptr %q) {
 }
 
 define i32 @test9(ptr %p, ptr %q) {
-; CHECK-LABEL: @test9
-; CHECK-NEXT: call void @clobber()
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test9(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store (within the call).
 
@@ -103,9 +127,12 @@ define i32 @test9(ptr %p, ptr %q) {
 define i32 @test10(ptr %p, ptr %q) {
 ; If one access encloses the other, then the merged access is the enclosed one
 ; and not just the common final access type.
-; CHECK-LABEL: @test10
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAG_X_i:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test10(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !15  ; TAG_X_i
   %b = call i32 @foo(ptr %p), !tbaa !19  ; TAG_Y_x_i
   %c = add i32 %a, %b
@@ -115,12 +142,6 @@ define i32 @test10(ptr %p, ptr %q) {
 declare void @clobber()
 declare i32 @foo(ptr) readonly
 
-; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
 !0 = !{!5, !5, i64 0}
 !1 = !{!6, !6, i64 0}
 !2 = !{!"tbaa root"}
@@ -132,9 +153,6 @@ declare i32 @foo(ptr) readonly
 !8 = !{!"another root"}
 !11 = !{!"scalar type", !8}
 
-; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
-; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
-; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
 !15 = !{!16, !17, i64 0}            ; TAG_X_i
 !16 = !{!"struct X", !17, i64 0}    ; struct X { int i; };
 !17 = !{!"int", !18, i64 0}
@@ -146,3 +164,19 @@ declare i32 @foo(ptr) readonly
 ; A TBAA structure who's only point is to have a constant location.
 !9 = !{!"yet another root"}
 !10 = !{!"node", !9, i64 1}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"C", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"A", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"tbaa root"}
+; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
+; CHECK: [[META5]] = !{!"B", [[META2]]}
+; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+; CHECK: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]}
+; CHECK: [[META9]] = !{!"another root"}
+; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0}
+; CHECK: [[META11]] = !{!"struct X", [[META12]], i64 0}
+; CHECK: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0}
+; CHECK: [[META13]] = !{!"char", [[META3]], i64 0}
+;.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
index 85978ff450e09..51b8816b8bc00 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent fmuls -- should compile to just a ret.
 define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_f16(
-; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_f16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -13,8 +15,9 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x
 }
 
 define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_f32(
-; CHECK-NEXT:    ret <vscale x 4 x float> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_f32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
   %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -22,8 +25,9 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale
 }
 
 define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_f64(
-; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_f64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
   %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -31,8 +35,9 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
 }
 
 define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_different_argument_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A:%.*]])
+; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -42,8 +47,9 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x
 }
 
 define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_with_predicated_dup(
-; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -53,7 +59,8 @@ define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1
 define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
   ; Edge case -- make sure that the case where we're fmultiplying two dups
   ; together is sane.
-; CHECK-LABEL: @idempotent_fmul_two_dups(
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
@@ -64,8 +71,9 @@ define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vsc
 
 ; Non-idempotent fmuls -- we don't expect these to be optimised out.
 define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_f16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 8 x half> @non_idempotent_fmul_f16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[A]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
@@ -74,8 +82,9 @@ define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vsca
 }
 
 define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 4 x float> @non_idempotent_fmul_f32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[A]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
@@ -84,8 +93,9 @@ define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vsc
 }
 
 define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 2 x double> @non_idempotent_fmul_f64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
@@ -95,9 +105,10 @@ define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vs
 
 define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
   ; Different predicates
-; CHECK-LABEL: @non_idempotent_fmul_with_predicated_dup(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-LABEL: define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(
+; CHECK-SAME: <vscale x 2 x i1> [[PG1:%.*]], <vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG2]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
index 4ab1f954b33e9..5ad0731fbb0e6 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent fmuls_u -- should compile to just a ret.
 define <vscale x 8 x half> @idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_u_f16(
-; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_u_f16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -13,8 +15,9 @@ define <vscale x 8 x half> @idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale
 }
 
 define <vscale x 4 x float> @idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_u_f32(
-; CHECK-NEXT:    ret <vscale x 4 x float> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 4 x float> @idempotent_fmul_u_f32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0)
   %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -22,8 +25,9 @@ define <vscale x 4 x float> @idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscal
 }
 
 define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_u_f64(
-; CHECK-NEXT:    ret <vscale x 2 x double> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_f64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
   %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
@@ -31,8 +35,9 @@ define <vscale x 2 x double> @idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vsca
 }
 
 define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_u_different_argument_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A:%.*]])
+; CHECK-LABEL: define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[A]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0)
@@ -42,8 +47,9 @@ define <vscale x 2 x double> @idempotent_fmul_u_different_argument_order(<vscale
 }
 
 define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @idempotent_fmul_u_with_predicated_dup(
-; CHECK-NEXT:    ret <vscale x 8 x half> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -53,7 +59,8 @@ define <vscale x 8 x half> @idempotent_fmul_u_with_predicated_dup(<vscale x 8 x
 define <vscale x 8 x half> @idempotent_fmul_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
   ; Edge case -- make sure that the case where we're fmultiplying two dups
   ; together is sane.
-; CHECK-LABEL: @idempotent_fmul_u_two_dups(
+; CHECK-LABEL: define <vscale x 8 x half> @idempotent_fmul_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
@@ -64,8 +71,9 @@ define <vscale x 8 x half> @idempotent_fmul_u_two_dups(<vscale x 8 x i1> %pg, <v
 
 ; Non-idempotent fmuls_u -- we don't expect these to be optimised out.
 define <vscale x 8 x half> @non_idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_u_f16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 8 x half> @non_idempotent_fmul_u_f16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.u.nxv8f16(<vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[A]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
@@ -74,8 +82,9 @@ define <vscale x 8 x half> @non_idempotent_fmul_u_f16(<vscale x 8 x i1> %pg, <vs
 }
 
 define <vscale x 4 x float> @non_idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_u_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 4 x float> @non_idempotent_fmul_u_f32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.u.nxv4f32(<vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[A]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
@@ -84,8 +93,9 @@ define <vscale x 4 x float> @non_idempotent_fmul_u_f32(<vscale x 4 x i1> %pg, <v
 }
 
 define <vscale x 2 x double> @non_idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
-; CHECK-LABEL: @non_idempotent_fmul_u_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 2 x double> @non_idempotent_fmul_u_f64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[A]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
@@ -95,9 +105,10 @@ define <vscale x 2 x double> @non_idempotent_fmul_u_f64(<vscale x 2 x i1> %pg, <
 
 define <vscale x 2 x double> @non_idempotent_fmul_u_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
   ; Different predicates
-; CHECK-LABEL: @non_idempotent_fmul_u_with_predicated_dup(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
+; CHECK-LABEL: define <vscale x 2 x double> @non_idempotent_fmul_u_with_predicated_dup(
+; CHECK-SAME: <vscale x 2 x i1> [[PG1:%.*]], <vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x double> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[PG2]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg1, double 1.0)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
index 027f671712bed..83018200a521e 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent muls -- should compile to just a ret.
 define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_i16(
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -13,8 +15,9 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8
 }
 
 define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_i32(
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
   %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -22,8 +25,9 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4
 }
 
 define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_i64(
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -31,8 +35,9 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
 }
 
 define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_different_argument_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> [[A:%.*]])
+; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> [[A]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -42,8 +47,9 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x
 }
 
 define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_with_predicated_dup(
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -53,7 +59,8 @@ define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1>
 define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
   ; Edge case -- make sure that the case where we're multiplying two dups
   ; together is sane.
-; CHECK-LABEL: @idempotent_mul_two_dups(
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
@@ -64,8 +71,9 @@ define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscal
 
 ; Non-idempotent muls -- we don't expect these to be optimised out.
 define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 2, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 8 x i16> @non_idempotent_mul_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 2, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
@@ -74,8 +82,9 @@ define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale
 }
 
 define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 4 x i32> @non_idempotent_mul_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
@@ -84,8 +93,9 @@ define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale
 }
 
 define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 2 x i64> @non_idempotent_mul_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
@@ -95,9 +105,10 @@ define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale
 
 define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) #0 {
   ; Different predicates
-; CHECK-LABEL: @non_idempotent_mul_with_predicated_dup(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1:%.*]], i64 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-LABEL: define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(
+; CHECK-SAME: <vscale x 2 x i1> [[PG1:%.*]], <vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG2]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg1, i64 1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
index 3b11a0d887985..2e7475de0aa77 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Idempotent muls -- should compile to just a ret.
 define <vscale x 8 x i16> @idempotent_mul_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_u_i16(
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -13,8 +15,9 @@ define <vscale x 8 x i16> @idempotent_mul_u_i16(<vscale x 8 x i1> %pg, <vscale x
 }
 
 define <vscale x 4 x i32> @idempotent_mul_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_u_i32(
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 4 x i32> @idempotent_mul_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
   %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -22,8 +25,9 @@ define <vscale x 4 x i32> @idempotent_mul_u_i32(<vscale x 4 x i1> %pg, <vscale x
 }
 
 define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_u_i64(
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
@@ -31,8 +35,9 @@ define <vscale x 2 x i64> @idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x
 }
 
 define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_u_different_argument_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> [[A:%.*]])
+; CHECK-LABEL: define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> [[A]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
@@ -42,8 +47,9 @@ define <vscale x 2 x i64> @idempotent_mul_u_different_argument_order(<vscale x 2
 }
 
 define <vscale x 8 x i16> @idempotent_mul_u_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @idempotent_mul_u_with_predicated_dup(
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[A:%.*]]
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_u_with_predicated_dup(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -53,7 +59,8 @@ define <vscale x 8 x i16> @idempotent_mul_u_with_predicated_dup(<vscale x 8 x i1
 define <vscale x 8 x i16> @idempotent_mul_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
   ; Edge case -- make sure that the case where we're multiplying two dups
   ; together is sane.
-; CHECK-LABEL: @idempotent_mul_u_two_dups(
+; CHECK-LABEL: define <vscale x 8 x i16> @idempotent_mul_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
@@ -64,8 +71,9 @@ define <vscale x 8 x i16> @idempotent_mul_u_two_dups(<vscale x 8 x i1> %pg, <vsc
 
 ; Non-idempotent muls -- we don't expect these to be optimised out.
 define <vscale x 8 x i16> @non_idempotent_mul_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_u_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 2, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 8 x i16> @non_idempotent_mul_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 2, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
@@ -74,8 +82,9 @@ define <vscale x 8 x i16> @non_idempotent_mul_u_i16(<vscale x 8 x i1> %pg, <vsca
 }
 
 define <vscale x 4 x i32> @non_idempotent_mul_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_u_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 4 x i32> @non_idempotent_mul_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
@@ -84,8 +93,9 @@ define <vscale x 4 x i32> @non_idempotent_mul_u_i32(<vscale x 4 x i1> %pg, <vsca
 }
 
 define <vscale x 2 x i64> @non_idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
-; CHECK-LABEL: @non_idempotent_mul_u_i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-LABEL: define <vscale x 2 x i64> @non_idempotent_mul_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
@@ -95,9 +105,10 @@ define <vscale x 2 x i64> @non_idempotent_mul_u_i64(<vscale x 2 x i1> %pg, <vsca
 
 define <vscale x 2 x i64> @non_idempotent_mul_u_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) #0 {
   ; Different predicates
-; CHECK-LABEL: @non_idempotent_mul_u_with_predicated_dup(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1:%.*]], i64 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
+; CHECK-LABEL: define <vscale x 2 x i64> @non_idempotent_mul_u_with_predicated_dup(
+; CHECK-SAME: <vscale x 2 x i1> [[PG1:%.*]], <vscale x 2 x i1> [[PG2:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG2]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg1, i64 1)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 779def76fc58d..5fdb918c87545 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -6547,3 +6547,21 @@ define half @test_constant_fold_exp2_f16_neg_denorm() {
   %val = call half @llvm.amdgcn.exp2.f16(half 0xH83ff)
   ret half %val
 }
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.prng
+; --------------------------------------------------------------------
+declare i32 @llvm.amdgcn.prng.b32(i32)
+define i32 @prng_undef_i32() {
+; CHECK-LABEL: @prng_undef_i32(
+; CHECK-NEXT:    ret i32 undef
+  %prng = call i32 @llvm.amdgcn.prng.b32(i32 undef)
+  ret i32 %prng
+}
+
+define i32 @prng_poison_i32() {
+; CHECK-LABEL: @prng_poison_i32(
+; CHECK-NEXT:    ret i32 poison
+  %prng = call i32 @llvm.amdgcn.prng.b32(i32 poison)
+  ret i32 %prng
+}
diff --git a/llvm/test/Transforms/InstCombine/bit_ceil.ll b/llvm/test/Transforms/InstCombine/bit_ceil.ll
index 0551a5cb5e2f2..a2e27dfd6f64d 100644
--- a/llvm/test/Transforms/InstCombine/bit_ceil.ll
+++ b/llvm/test/Transforms/InstCombine/bit_ceil.ll
@@ -320,6 +320,23 @@ define i32 @pr91691_keep_nsw(i32 %0) {
   ret i32 %7
 }
 
+define i32 @test_drop_range_attr(i32 %x) {
+; CHECK-LABEL: @test_drop_range_attr(
+; CHECK-NEXT:    [[CTLZ:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i32 0, [[CTLZ]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 31
+; CHECK-NEXT:    [[SEL:%.*]] = shl nuw i32 1, [[TMP2]]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %ctlz = call range(i32 1, 33) i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %sub = sub i32 32, %ctlz
+  %shl = shl i32 1, %sub
+  %dec = add i32 %x, -1
+  %ult = icmp ult i32 %dec, -2
+  %sel = select i1 %ult, i32 %shl, i32 1
+  ret i32 %sel
+}
+
 declare i32 @llvm.ctlz.i32(i32, i1 immarg)
 declare i64 @llvm.ctlz.i64(i64, i1 immarg)
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
diff --git a/llvm/test/Transforms/InstCombine/conditional-variable-length-signext-after-high-bit-extract.ll b/llvm/test/Transforms/InstCombine/conditional-variable-length-signext-after-high-bit-extract.ll
index 5569af452de4f..b00b3a289de47 100644
--- a/llvm/test/Transforms/InstCombine/conditional-variable-length-signext-after-high-bit-extract.ll
+++ b/llvm/test/Transforms/InstCombine/conditional-variable-length-signext-after-high-bit-extract.ll
@@ -1137,3 +1137,18 @@ define i32 @n290_or_with_wrong_magic(i32 %data, i32 %nbits) {
   %signextended = or i32 %high_bits_extracted, %magic
   ret i32 %signextended
 }
+
+define i32 @bitwidth_does_not_fit(i3 %arg) {
+; CHECK-LABEL: @bitwidth_does_not_fit(
+; CHECK-NEXT:    [[NEG:%.*]] = sub i3 0, [[ARG:%.*]]
+; CHECK-NEXT:    [[NEG_EXT:%.*]] = zext i3 [[NEG]] to i32
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 1, [[NEG_EXT]]
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[SHR]], 1
+; CHECK-NEXT:    ret i32 [[INC]]
+;
+  %neg = sub i3 0, %arg
+  %neg.ext = zext i3 %neg to i32
+  %shr = lshr i32 1, %neg.ext
+  %inc = add i32 %shr, 1
+  ret i32 %inc
+}
diff --git a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
index 74a1e318d77ed..9a723e8bc89ff 100644
--- a/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/create-class-from-logic-fcmp.ll
@@ -990,7 +990,8 @@ define i1 @not_isnormalinf_or_inf(half %x) #0 {
 ; -> subnormal | zero | nan
 define i1 @not_isnormalinf_or_uno(half %x) #0 {
 ; CHECK-LABEL: @not_isnormalinf_or_uno(
-; CHECK-NEXT:    [[OR:%.*]] = call i1 @llvm.is.fpclass.f16(half [[X:%.*]], i32 243)
+; CHECK-NEXT:    [[FABS:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]])
+; CHECK-NEXT:    [[OR:%.*]] = fcmp ult half [[FABS]], 0xH0400
 ; CHECK-NEXT:    ret i1 [[OR]]
 ;
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -1003,7 +1004,8 @@ define i1 @not_isnormalinf_or_uno(half %x) #0 {
 ; -> subnormal | zero | nan
 define i1 @not_isnormalinf_or_uno_nofabs(half %x) #0 {
 ; CHECK-LABEL: @not_isnormalinf_or_uno_nofabs(
-; CHECK-NEXT:    [[OR:%.*]] = call i1 @llvm.is.fpclass.f16(half [[X:%.*]], i32 243)
+; CHECK-NEXT:    [[FABS:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]])
+; CHECK-NEXT:    [[OR:%.*]] = fcmp ult half [[FABS]], 0xH0400
 ; CHECK-NEXT:    ret i1 [[OR]]
 ;
   %fabs = call half @llvm.fabs.f16(half %x)
diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll
index 28a4702559c46..2bd719e236137 100644
--- a/llvm/test/Transforms/InstCombine/extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement.ll
@@ -722,20 +722,14 @@ define i8 @bitcast_scalar_index_variable(i32 %x, i64 %y) {
   ret i8 %r
 }
 
-; extra use is ok if we don't need a shift
+; extra use is not ok, even if we don't need a shift
 
 define i8 @bitcast_scalar_index0_use(i64 %x) {
-; ANYLE-LABEL: @bitcast_scalar_index0_use(
-; ANYLE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYLE-NEXT:    call void @use(<8 x i8> [[V]])
-; ANYLE-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i8
-; ANYLE-NEXT:    ret i8 [[R]]
-;
-; ANYBE-LABEL: @bitcast_scalar_index0_use(
-; ANYBE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
-; ANYBE-NEXT:    call void @use(<8 x i8> [[V]])
-; ANYBE-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
-; ANYBE-NEXT:    ret i8 [[R]]
+; ANY-LABEL: @bitcast_scalar_index0_use(
+; ANY-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <8 x i8>
+; ANY-NEXT:    call void @use(<8 x i8> [[V]])
+; ANY-NEXT:    [[R:%.*]] = extractelement <8 x i8> [[V]], i64 0
+; ANY-NEXT:    ret i8 [[R]]
 ;
 
   %v = bitcast i64 %x to <8 x i8>
diff --git a/llvm/test/Transforms/InstCombine/intersect-accessgroup.ll b/llvm/test/Transforms/InstCombine/intersect-accessgroup.ll
index 2236efd5aaad9..5c4d95ebf4831 100644
--- a/llvm/test/Transforms/InstCombine/intersect-accessgroup.ll
+++ b/llvm/test/Transforms/InstCombine/intersect-accessgroup.ll
@@ -12,12 +12,9 @@
 ; 				}
 ; }
 ;
-; Check for correctly merging access group metadata for instcombine
-; (only common loops are parallel == intersection)
-; Note that combined load would be parallel to loop !16 since both
-; origin loads are parallel to it, but it references two access groups
-; (!8 and !9), neither of which contain both loads. As such, the
-; information that the combined load is parallel to !16 is lost.
+; Check that the original access group on %0 is preserved when replacing uses
+; of %1 with it: %0 is not moved, and it would be UB if %0 were not parallel
+; in the original loop.
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -107,7 +104,9 @@ for.end32:
 ; CHECK: load double, {{.*}} !llvm.access.group ![[ACCESSGROUP_0:[0-9]+]]
 ; CHECK: br label %for.cond14, !llvm.loop ![[LOOP_4:[0-9]+]]
 
-; CHECK: ![[ACCESSGROUP_0]] = distinct !{}
+; CHECK: ![[ACCESSGROUP_0]] = !{![[G1:[0-9]+]], ![[G2:[0-9]+]]}
+; CHECK: ![[G1]] = distinct !{}
+; CHECK: ![[G2]] = distinct !{}
 
 ; CHECK: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[PARALLEL_ACCESSES_5:[0-9]+]]}
-; CHECK: ![[PARALLEL_ACCESSES_5]] = !{!"llvm.loop.parallel_accesses", ![[ACCESSGROUP_0]]}
+; CHECK: ![[PARALLEL_ACCESSES_5]] = !{!"llvm.loop.parallel_accesses", ![[G1]]}
diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
index dc9daf5265d37..3fdba7cfae67e 100644
--- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
+++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
@@ -201,12 +201,11 @@ entry:
   ret ptr %l.sel
 }
 
-; FIXME: Should preserve metadata on loads.
 define double @preserve_load_metadata_after_select_transform2(ptr %a, ptr %b) {
 ; CHECK-LABEL: @preserve_load_metadata_after_select_transform2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
+; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
 ; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]]
 ; CHECK-NEXT:    [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]]
 ; CHECK-NEXT:    ret double [[L_SEL]]
@@ -220,12 +219,11 @@ entry:
   ret double %l.sel
 }
 
-; FIXME: Should preserve metadata on loads.
 define double @preserve_load_metadata_after_select_transform_metadata_missing_1(ptr %a, ptr %b) {
 ; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !llvm.access.group [[META6]]
+; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
 ; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]]
 ; CHECK-NEXT:    [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]]
 ; CHECK-NEXT:    ret double [[L_SEL]]
@@ -242,8 +240,8 @@ entry:
 define double @preserve_load_metadata_after_select_transform_metadata_missing_2(ptr %a, ptr %b) {
 ; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !llvm.access.group [[META6]]
+; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !llvm.access.group [[META6]]
 ; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]]
 ; CHECK-NEXT:    [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]]
 ; CHECK-NEXT:    ret double [[L_SEL]]
@@ -257,12 +255,11 @@ entry:
   ret double %l.sel
 }
 
-; FIXME: Should preserve metadata on loads.
 define double @preserve_load_metadata_after_select_transform_metadata_missing_3(ptr %a, ptr %b) {
 ; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
+; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
 ; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]]
 ; CHECK-NEXT:    [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]]
 ; CHECK-NEXT:    ret double [[L_SEL]]
@@ -272,7 +269,27 @@ entry:
   %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !7
   %cmp.i = fcmp fast olt double %l.a, %l.b
   %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a
-  %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !12
+  %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !13
+  ret double %l.sel
+}
+
+; Like preserve_load_metadata_after_select_transform_metadata_missing_3, but
+; with different access groups on all loads.
+define double @preserve_load_metadata_after_select_transform_metadata_missing_4(ptr %a, ptr %b) {
+; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[META6]]
+; CHECK-NEXT:    [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP10:![0-9]+]]
+; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]]
+; CHECK-NEXT:    [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]]
+; CHECK-NEXT:    ret double [[L_SEL]]
+;
+entry:
+  %l.a = load double, ptr %a, align 8, !tbaa !0, !llvm.access.group !7
+  %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !12
+  %cmp.i = fcmp fast olt double %l.a, %l.b
+  %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a
+  %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !13
   ret double %l.sel
 }
 
@@ -288,7 +305,8 @@ entry:
 !9 = !{i64 8}
 !10 = distinct !{}
 !11 = !{i32 5, i32 6}
-!12 = !{}
+!12 = distinct !{}
+!13 = distinct !{}
 ;.
 ; CHECK: [[TBAA0]] = !{[[LOOP1]], [[LOOP1]], i64 0}
 ; CHECK: [[LOOP1]] = !{!"scalar type", [[META2:![0-9]+]]}
@@ -300,4 +318,5 @@ entry:
 ; CHECK: [[META7]] = !{i32 1}
 ; CHECK: [[META8]] = !{i64 8}
 ; CHECK: [[ACC_GRP9]] = distinct !{}
+; CHECK: [[ACC_GRP10]] = distinct !{}
 ;.
diff --git a/llvm/test/Transforms/InstCombine/ptrmask.ll b/llvm/test/Transforms/InstCombine/ptrmask.ll
index bcbd78e23ed67..54d1417ff9fd9 100644
--- a/llvm/test/Transforms/InstCombine/ptrmask.ll
+++ b/llvm/test/Transforms/InstCombine/ptrmask.ll
@@ -578,3 +578,16 @@ define ptr @ptrmask_is_useless_fail1(i64 %i, i64 %m) {
   %r = call ptr @llvm.ptrmask.p0.i64(ptr %p0, i64 %m0)
   ret ptr %r
 }
+
+@GC_arrays = external global { i8, i8, i64 }
+
+define ptr @ptrmask_demandedbits_constantexpr() {
+; CHECK-LABEL: define ptr @ptrmask_demandedbits_constantexpr() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALIGNED_RESULT:%.*]] = call align 8 ptr @llvm.ptrmask.p0.i64(ptr nonnull @GC_arrays, i64 -8)
+; CHECK-NEXT:    ret ptr [[ALIGNED_RESULT]]
+;
+entry:
+  %aligned_result = call ptr @llvm.ptrmask.p0.i64(ptr getelementptr inbounds (i8, ptr @GC_arrays, i64 1), i64 -8)
+  ret ptr %aligned_result
+}
diff --git a/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll b/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll
index b3b20609296d6..00a815322cd24 100644
--- a/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll
+++ b/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll
@@ -1,9 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=instcombine -S -o - < %s | FileCheck %s
 
 define <vscale x 2 x float> @shrink_splat_scalable_extend(<vscale x 2 x float> %a) {
-  ; CHECK-LABEL: @shrink_splat_scalable_extend
-  ; CHECK-NEXT:  %[[FADD:.*]] = fadd <vscale x 2 x float> %a, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float -1.000000e+00, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
-  ; CHECK-NEXT:  ret <vscale x 2 x float> %[[FADD]]
+; CHECK-LABEL: define <vscale x 2 x float> @shrink_splat_scalable_extend(
+; CHECK-SAME: <vscale x 2 x float> [[A:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <vscale x 2 x float> [[A]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float -1.000000e+00, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x float> [[TMP1]]
+;
   %1 = shufflevector <vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float -1.000000e+00, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
   %2 = fpext <vscale x 2 x float> %a to <vscale x 2 x double>
   %3 = fpext <vscale x 2 x float> %1 to <vscale x 2 x double>
diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index cd8c29ba4cd81..e563cafbc7e4f 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -706,11 +706,10 @@ define i32 @select_lshr_icmp_const_different_values(i32 %x, i32 %y) {
   ret i32 %C
 }
 
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad(
+define float @select_fadd_fcmp_equiv(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_equiv(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -720,6 +719,19 @@ define float @select_fadd_fcmp_bad(float %x, float %y, float %z) {
   ret float %C
 }
 
+define float @select_fadd_fcmp_equiv2(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_equiv2(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, -1.0
+  %B = fadd nsz float %x, %z
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_2(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_2(
@@ -893,24 +905,10 @@ define float @select_fadd_fcmp_bad_13(float %x, float %y, float %z) {
   ret float %C
 }
 
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad_14(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad_14(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
-; CHECK-NEXT:    ret float [[C]]
-;
-  %A = fcmp une float %x, -1.0
-  %B = fadd nsz float %x, %z
-  %C = select i1 %A, float %y, float %B
-  ret float %C
-}
-
-define float @select_fmul_fcmp_bad(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fmul_fcmp_bad(
+define float @select_fmul_fcmp_equiv(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fmul_fcmp_equiv(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 3.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fmul nsz float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fmul nsz float [[Z:%.*]], 3.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -920,11 +918,10 @@ define float @select_fmul_fcmp_bad(float %x, float %y, float %z) {
   ret float %C
 }
 
-define float @select_fmul_fcmp_bad_2(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fmul_fcmp_bad_2(
+define float @select_fmul_fcmp_equiv2(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fmul_fcmp_equiv2(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fmul float [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 1.0
@@ -959,10 +956,10 @@ define float @select_fmul_icmp_bad_2(float %x, float %y, float %z, i32 %k) {
   ret float %C
 }
 
-define float @select_fdiv_fcmp_bad(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fdiv_fcmp_bad(
+define float @select_fdiv_fcmp_equiv(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fdiv_fcmp_equiv(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fdiv float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fdiv float 1.000000e+00, [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -972,10 +969,10 @@ define float @select_fdiv_fcmp_bad(float %x, float %y, float %z) {
   ret float %C
 }
 
-define float @select_fdiv_fcmp_bad_2(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fdiv_fcmp_bad_2(
+define float @select_fdiv_fcmp_equiv2(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fdiv_fcmp_equiv2(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 3.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fdiv nsz float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fdiv nsz float 3.000000e+00, [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -1001,10 +998,10 @@ define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
   ret float %C
 }
 
-define float @select_fsub_fcmp_bad_2(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fsub_fcmp_bad_2(
+define float @select_fsub_fcmp_equiv(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fsub_fcmp_equiv(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[Z:%.*]], [[X]]
+; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/select-value-equivalence.ll b/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
index 433fafc7e553b..da2e59d760f96 100644
--- a/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
+++ b/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
@@ -90,7 +90,7 @@ define float @select_fcmp_fadd_oeq_not_zero(float %x, float %y) {
 ; CHECK-LABEL: define float @select_fcmp_fadd_oeq_not_zero(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp oeq float [[Y]], 2.000000e+00
-; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], 2.000000e+00
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select i1 [[FCMP]], float [[FADD]], float [[X]]
 ; CHECK-NEXT:    ret float [[RETVAL]]
 ;
@@ -104,7 +104,7 @@ define float @select_fcmp_fadd_une_not_zero(float %x, float %y) {
 ; CHECK-LABEL: define float @select_fcmp_fadd_une_not_zero(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp une float [[Y]], 2.000000e+00
-; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], 2.000000e+00
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select i1 [[FCMP]], float [[X]], float [[FADD]]
 ; CHECK-NEXT:    ret float [[RETVAL]]
 ;
@@ -118,7 +118,7 @@ define float @select_fcmp_fadd_ueq_nnan_not_zero(float %x, float %y) {
 ; CHECK-LABEL: define float @select_fcmp_fadd_ueq_nnan_not_zero(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp nnan ueq float [[Y]], 2.000000e+00
-; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], 2.000000e+00
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select i1 [[FCMP]], float [[FADD]], float [[X]]
 ; CHECK-NEXT:    ret float [[RETVAL]]
 ;
@@ -132,7 +132,7 @@ define float @select_fcmp_fadd_one_nnan_not_zero(float %x, float %y) {
 ; CHECK-LABEL: define float @select_fcmp_fadd_one_nnan_not_zero(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp nnan one float [[Y]], 2.000000e+00
-; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[X]], 2.000000e+00
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select i1 [[FCMP]], float [[X]], float [[FADD]]
 ; CHECK-NEXT:    ret float [[RETVAL]]
 ;
@@ -202,7 +202,7 @@ define <2 x float> @select_fcmp_fadd_oeq_not_zero_vec(<2 x float> %x, <2 x float
 ; CHECK-LABEL: define <2 x float> @select_fcmp_fadd_oeq_not_zero_vec(
 ; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp oeq <2 x float> [[Y]], splat (float 2.000000e+00)
-; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], splat (float 2.000000e+00)
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select <2 x i1> [[FCMP]], <2 x float> [[FADD]], <2 x float> [[X]]
 ; CHECK-NEXT:    ret <2 x float> [[RETVAL]]
 ;
@@ -216,7 +216,7 @@ define <2 x float> @select_fcmp_fadd_une_not_zero_vec(<2 x float> %x, <2 x float
 ; CHECK-LABEL: define <2 x float> @select_fcmp_fadd_une_not_zero_vec(
 ; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp une <2 x float> [[Y]], splat (float 2.000000e+00)
-; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], splat (float 2.000000e+00)
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select <2 x i1> [[FCMP]], <2 x float> [[X]], <2 x float> [[FADD]]
 ; CHECK-NEXT:    ret <2 x float> [[RETVAL]]
 ;
@@ -230,7 +230,7 @@ define <2 x float> @select_fcmp_fadd_ueq_nnan_not_zero_vec(<2 x float> %x, <2 x
 ; CHECK-LABEL: define <2 x float> @select_fcmp_fadd_ueq_nnan_not_zero_vec(
 ; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp nnan ueq <2 x float> [[Y]], splat (float 2.000000e+00)
-; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], splat (float 2.000000e+00)
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select <2 x i1> [[FCMP]], <2 x float> [[FADD]], <2 x float> [[X]]
 ; CHECK-NEXT:    ret <2 x float> [[RETVAL]]
 ;
@@ -244,7 +244,7 @@ define <2 x float> @select_fcmp_fadd_one_nnan_not_zero_vec(<2 x float> %x, <2 x
 ; CHECK-LABEL: define <2 x float> @select_fcmp_fadd_one_nnan_not_zero_vec(
 ; CHECK-SAME: <2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[FCMP:%.*]] = fcmp nnan one <2 x float> [[Y]], splat (float 2.000000e+00)
-; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    [[FADD:%.*]] = fadd <2 x float> [[X]], splat (float 2.000000e+00)
 ; CHECK-NEXT:    [[RETVAL:%.*]] = select <2 x i1> [[FCMP]], <2 x float> [[X]], <2 x float> [[FADD]]
 ; CHECK-NEXT:    ret <2 x float> [[RETVAL]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
index 1c381d0839071..63caec9501325 100644
--- a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
+++ b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
@@ -15,6 +15,22 @@ entry:
   ret i32 %res
 }
 
+; Make sure that noundef is dropped when the cttz zero flag is changed from false to true.
+
+define i32 @shl_cttz_false_noundef(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_false_noundef(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[Y]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call noundef i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
 define i32 @shl_ctlz_false(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i32 @shl_ctlz_false(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
index f745a40364211..ab2a7faa107c7 100644
--- a/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
@@ -90,4 +90,26 @@ define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
   ret <4 x float> %ins1
 }
 
+; Test that foldBitcastExtElt doesn't interfere with folding the extract/insert chain into a shufflevector.
+
+define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
+; CHECK-LABEL: @bitcast_extract_insert_to_shuffle(
+; CHECK-NEXT:    [[AVEC:%.*]] = bitcast i32 [[A:%.*]] to <2 x half>
+; CHECK-NEXT:    [[BVEC:%.*]] = bitcast i32 [[B:%.*]] to <2 x half>
+; CHECK-NEXT:    [[INS3:%.*]] = shufflevector <2 x half> [[AVEC]], <2 x half> [[BVEC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x half> [[INS3]]
+;
+  %avec = bitcast i32 %a to <2 x half>
+  %a0 = extractelement <2 x half> %avec, i32 0
+  %a1 = extractelement <2 x half> %avec, i32 1
+  %bvec = bitcast i32 %b to <2 x half>
+  %b0 = extractelement <2 x half> %bvec, i32 0
+  %b1 = extractelement <2 x half> %bvec, i32 1
+  %ins0 = insertelement <4 x half> undef, half %a0, i32 0
+  %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
+  %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
+  %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
+  ret <4 x half> %ins3
+}
+
 
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/extractelement-vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/extractelement-vscale.ll
index d0f9b2276177e..de2ee65d8ec9b 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/extractelement-vscale.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/extractelement-vscale.ll
@@ -1,21 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
 
-; CHECK-LABEL: definitely_in_bounds
-; CHECK: ret i8 0
 define i8 @definitely_in_bounds() {
+; CHECK-LABEL: define i8 @definitely_in_bounds() {
+; CHECK-NEXT:    ret i8 0
+;
   ret i8 extractelement (<vscale x 16 x i8> zeroinitializer, i64 15)
 }
 
-; CHECK-LABEL: maybe_in_bounds
-; CHECK: ret i8 extractelement (<vscale x 16 x i8> zeroinitializer, i64 16)
 define i8 @maybe_in_bounds() {
+; CHECK-LABEL: define i8 @maybe_in_bounds() {
+; CHECK-NEXT:    ret i8 extractelement (<vscale x 16 x i8> zeroinitializer, i64 16)
+;
   ret i8 extractelement (<vscale x 16 x i8> zeroinitializer, i64 16)
 }
 
 ; Examples of extracting a lane from a splat constant
 
 define i32 @extractconstant_shuffle_in_range(i32 %v) {
-; CHECK-LABEL: @extractconstant_shuffle_in_range(
+; CHECK-LABEL: define i32 @extractconstant_shuffle_in_range(
+; CHECK-SAME: i32 [[V:%.*]]) {
 ; CHECK-NEXT:    ret i32 1024
 ;
   %in = insertelement <vscale x 4 x i32> undef, i32 1024, i32 0
@@ -25,7 +29,8 @@ define i32 @extractconstant_shuffle_in_range(i32 %v) {
 }
 
 define i32 @extractconstant_shuffle_maybe_out_of_range(i32 %v) {
-; CHECK-LABEL: @extractconstant_shuffle_maybe_out_of_range(
+; CHECK-LABEL: define i32 @extractconstant_shuffle_maybe_out_of_range(
+; CHECK-SAME: i32 [[V:%.*]]) {
 ; CHECK-NEXT:    ret i32 extractelement (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1024, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer), i32 4)
 ;
   %in = insertelement <vscale x 4 x i32> undef, i32 1024, i32 0
@@ -35,7 +40,8 @@ define i32 @extractconstant_shuffle_maybe_out_of_range(i32 %v) {
 }
 
 define i32 @extractconstant_shuffle_invalid_index(i32 %v) {
-; CHECK-LABEL: @extractconstant_shuffle_invalid_index(
+; CHECK-LABEL: define i32 @extractconstant_shuffle_invalid_index(
+; CHECK-SAME: i32 [[V:%.*]]) {
 ; CHECK-NEXT:    ret i32 extractelement (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1024, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer), i32 -1)
 ;
   %in = insertelement <vscale x 4 x i32> undef, i32 1024, i32 0
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll
index c37dbd6d3a350..912180eee6bae 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector-inseltpoison.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=early-cse -earlycse-debug-hash -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -14,26 +15,24 @@ target triple = "aarch64"
 ; see that the instruction can be passed to SimplifyInstruction and not crash
 ; the compiler. It happens to be the case that this will be the result.
 
-; CHECK-LABEL: define <vscale x 8 x i1> @vscale_version()
-; CHECK-NEXT: ret <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer)
 
 define <vscale x 8 x i1> @vscale_version() {
+; CHECK-LABEL: define <vscale x 8 x i1> @vscale_version() {
+; CHECK-NEXT:    ret <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer)
+;
   %splatter = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
-  %foo = shufflevector <vscale x 8 x i1> %splatter,
-                       <vscale x 8 x i1> undef,
-                       <vscale x 8 x i32> zeroinitializer
+  %foo = shufflevector <vscale x 8 x i1> %splatter, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
   ret <vscale x 8 x i1> %foo
 }
 
 ; The non-scalable version should be optimized as normal.
 
-; CHECK-LABEL: define <8 x i1> @fixed_length_version() {
-; CHECK-NEXT:  ret <8 x i1> splat (i1 true)
 define <8 x i1> @fixed_length_version() {
+; CHECK-LABEL: define <8 x i1> @fixed_length_version() {
+; CHECK-NEXT:    ret <8 x i1> splat (i1 true)
+;
   %splatter = insertelement <8 x i1> poison, i1 true, i32 0
-  %foo = shufflevector <8 x i1> %splatter,
-                       <8 x i1> undef,
-                       <8 x i32> zeroinitializer
+  %foo = shufflevector <8 x i1> %splatter, <8 x i1> undef, <8 x i32> zeroinitializer
   ret <8 x i1> %foo
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector.ll
index b575e656f552f..6b88b1d2a934c 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-shufflevector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=early-cse -earlycse-debug-hash -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -14,26 +15,24 @@ target triple = "aarch64"
 ; see that the instruction can be passed to SimplifyInstruction and not crash
 ; the compiler. It happens to be the case that this will be the result.
 
-; CHECK-LABEL: define <vscale x 8 x i1> @vscale_version()
-; CHECK-NEXT: ret <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> undef, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer)
 
 define <vscale x 8 x i1> @vscale_version() {
+; CHECK-LABEL: define <vscale x 8 x i1> @vscale_version() {
+; CHECK-NEXT:    ret <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> undef, i1 true, i32 0), <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer)
+;
   %splatter = insertelement <vscale x 8 x i1> undef, i1 true, i32 0
-  %foo = shufflevector <vscale x 8 x i1> %splatter,
-                       <vscale x 8 x i1> undef,
-                       <vscale x 8 x i32> zeroinitializer
+  %foo = shufflevector <vscale x 8 x i1> %splatter, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
   ret <vscale x 8 x i1> %foo
 }
 
 ; The non-scalable version should be optimized as normal.
 
-; CHECK-LABEL: define <8 x i1> @fixed_length_version() {
-; CHECK-NEXT:  ret <8 x i1> splat (i1 true)
 define <8 x i1> @fixed_length_version() {
+; CHECK-LABEL: define <8 x i1> @fixed_length_version() {
+; CHECK-NEXT:    ret <8 x i1> splat (i1 true)
+;
   %splatter = insertelement <8 x i1> undef, i1 true, i32 0
-  %foo = shufflevector <8 x i1> %splatter,
-                       <8 x i1> undef,
-                       <8 x i32> zeroinitializer
+  %foo = shufflevector <8 x i1> %splatter, <8 x i1> undef, <8 x i32> zeroinitializer
   ret <8 x i1> %foo
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
index 3a8bf53b32cab..6aa0adb2f0e67 100644
--- a/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
+++ b/llvm/test/Transforms/InstSimplify/logic-of-fcmps.ll
@@ -426,3 +426,68 @@ define i1 @olt_implies_olt_fail(float %x, float %y) {
   %ret = and i1 %olt, %olt2
   ret i1 %ret
 }
+
+define i1 @and_ord_olt_abs(float %x, float %y) {
+; CHECK-LABEL: @and_ord_olt_abs(
+; CHECK-NEXT:    [[ABSX:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[ABSX]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = fcmp ord float %x, 0.000000e+00
+  %absx = call float @llvm.fabs.f32(float %x)
+  %cmp2 = fcmp olt float %absx, %y
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @and_ord_olt_abs_commuted1(float %x, float %y) {
+; CHECK-LABEL: @and_ord_olt_abs_commuted1(
+; CHECK-NEXT:    [[ABSX:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[Y:%.*]], [[ABSX]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = fcmp ord float %x, 0.000000e+00
+  %absx = call float @llvm.fabs.f32(float %x)
+  %cmp2 = fcmp olt float %y, %absx
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define i1 @and_ord_olt_abs_commuted2(float %x, float %y) {
+; CHECK-LABEL: @and_ord_olt_abs_commuted2(
+; CHECK-NEXT:    [[ABSX:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[ABSX]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = fcmp ord float %x, 0.000000e+00
+  %absx = call float @llvm.fabs.f32(float %x)
+  %cmp2 = fcmp olt float %absx, %y
+  %and = and i1 %cmp2, %cmp1
+  ret i1 %and
+}
+
+define i1 @or_ord_ult_abs(float %x, float %y) {
+; CHECK-LABEL: @or_ord_ult_abs(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp1 = fcmp ord float %x, 0.000000e+00
+  %absx = call float @llvm.fabs.f32(float %x)
+  %cmp2 = fcmp ult float %absx, %y
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define i1 @and_ord_olt_absz(float %x, float %y, float %z) {
+; CHECK-LABEL: @and_ord_olt_absz(
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ord float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[ABSZ:%.*]] = call float @llvm.fabs.f32(float [[Z:%.*]])
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt float [[ABSZ]], [[Y:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = fcmp ord float %x, 0.000000e+00
+  %absz = call float @llvm.fabs.f32(float %z)
+  %cmp2 = fcmp olt float %absz, %y
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
diff --git a/llvm/test/Transforms/JumpThreading/PR33357-lvi-recursion.ll b/llvm/test/Transforms/JumpThreading/PR33357-lvi-recursion.ll
index 76ee63b37734c..a36f81bda55ab 100644
--- a/llvm/test/Transforms/JumpThreading/PR33357-lvi-recursion.ll
+++ b/llvm/test/Transforms/JumpThreading/PR33357-lvi-recursion.ll
@@ -12,7 +12,7 @@
 define void @f(i32 %p1) {
 bb0:
   %0 = icmp eq i32 %p1, 0
-  br i1 undef, label %bb6, label %bb1
+  br i1 true, label %bb6, label %bb1
 
 bb1:
   br label %bb2
diff --git a/llvm/test/Transforms/JumpThreading/basic.ll b/llvm/test/Transforms/JumpThreading/basic.ll
index bb8eeb7137164..f37843903a5b1 100644
--- a/llvm/test/Transforms/JumpThreading/basic.ll
+++ b/llvm/test/Transforms/JumpThreading/basic.ll
@@ -60,12 +60,11 @@ F2:
 }
 
 
-; Undef handling.
 define i32 @test3(i1 %cond) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT: T1:
 ; CHECK-NEXT: ret i32 42
-	br i1 undef, label %T1, label %F1
+	br i1 true, label %T1, label %F1
 
 T1:
 	ret i32 42
diff --git a/llvm/test/Transforms/JumpThreading/crash.ll b/llvm/test/Transforms/JumpThreading/crash.ll
index 69ef3b9762908..4fab807864db8 100644
--- a/llvm/test/Transforms/JumpThreading/crash.ll
+++ b/llvm/test/Transforms/JumpThreading/crash.ll
@@ -239,7 +239,7 @@ return:
 }
 
 ; PR6119
-define i32 @test8(i32 %action) nounwind {
+define i32 @test8(i32 %action, i1 %arg) nounwind {
 entry:
   switch i32 %action, label %lor.rhs [
     i32 1, label %if.then
@@ -257,14 +257,14 @@ lor.end:                                          ; preds = %lor.rhs, %entry
   br i1 %cmp103, label %for.cond, label %if.then
 
 for.cond:                                         ; preds = %for.body, %lor.end
-  br i1 undef, label %if.then, label %for.body
+  br i1 %arg, label %if.then, label %for.body
 
 for.body:                                         ; preds = %for.cond
   br label %for.cond
 }
 
 ; PR6119
-define i32 @test9(i32 %action) nounwind {
+define i32 @test9(i32 %action, i1 %arg) nounwind {
 entry:
   switch i32 %action, label %lor.rhs [
     i32 1, label %if.then
@@ -283,14 +283,14 @@ lor.end:                                          ; preds = %lor.rhs, %entry
   br i1 %cmp103, label %for.cond, label %if.then
 
 for.cond:                                         ; preds = %for.body, %lor.end
-  br i1 undef, label %if.then, label %for.body
+  br i1 %arg, label %if.then, label %for.body
 
 for.body:                                         ; preds = %for.cond
   br label %for.cond
 }
 
 ; PR6119
-define i32 @test10(i32 %action, i32 %type) nounwind {
+define i32 @test10(i32 %action, i32 %type, i1 %arg) nounwind {
 entry:
   %cmp2 = icmp eq i32 %type, 0                    ; <i1> [#uses=1]
   switch i32 %action, label %lor.rhs [
@@ -311,7 +311,7 @@ lor.end:                                          ; preds = %lor.rhs, %entry
   br i1 %cmp103, label %for.cond, label %if.then
 
 for.cond:                                         ; preds = %for.body, %lor.end
-  br i1 undef, label %if.then, label %for.body
+  br i1 %arg, label %if.then, label %for.body
 
 for.body:                                         ; preds = %for.cond
   br label %for.cond
@@ -391,7 +391,7 @@ if.end:                                           ; preds = %land.end69
 }
 
 ; PR7647
-define void @test15() nounwind {
+define void @test15(i1 %arg) nounwind {
 entry:
   ret void
 
@@ -420,7 +420,7 @@ lbl_709:
   br label %if.end949
 
 for.cond603:                                      ; preds = %for.body607, %if.end336
-  br i1 undef, label %for.cond603, label %if.end949
+  br i1 %arg, label %for.cond603, label %if.end949
 
 if.end949:                                        ; preds = %for.cond603, %lbl_709, %for.cond111
   %l_678.2 = phi i64 [ %l_678.5, %for.cond111 ], [ undef, %lbl_709 ], [ 5, %for.cond603 ] ; <i64> [#uses=1]
@@ -430,7 +430,7 @@ for.body1016:                                     ; preds = %for.cond1012
   br label %for.body1016
 
 for.cond1035:                                     ; preds = %for.inc1055, %if.then1026
-  br i1 undef, label %for.cond1040, label %lbl_664
+  br i1 %arg, label %for.cond1040, label %lbl_664
 
 for.cond1040:                                     ; preds = %for.body1044, %for.cond1035
   ret void
@@ -461,9 +461,9 @@ if.end:
   ret void
 }
 
-define void @test17() {
+define void @test17(i1 %arg) {
 entry:
-  br i1 undef, label %bb269.us.us, label %bb269.us.us.us
+  br i1 %arg, label %bb269.us.us, label %bb269.us.us.us
 
 bb269.us.us.us:
   %indvar = phi i64 [ %indvar.next, %bb287.us.us.us ], [ 0, %entry ]
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash.ll b/llvm/test/Transforms/JumpThreading/ddt-crash.ll
index 3f191a9d49dbc..b0bba1a2dd0c4 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash.ll
@@ -66,7 +66,7 @@ bb11:
   ret void
 }
 
-define void @spam(ptr %arg) {
+define void @spam(ptr %arg, i1 %arg2) {
 ; CHECK-LABEL: @spam(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP:%.*]] = load i8, ptr undef, align 8
@@ -87,9 +87,11 @@ define void @spam(ptr %arg) {
 ; CHECK-NEXT:      i8 0, label [[BB11]]
 ; CHECK-NEXT:      i8 1, label [[BB10:%.*]]
 ; CHECK-NEXT:      i8 2, label [[BB10]]
-; CHECK-NEXT:      i8 3, label [[BB8]]
-; CHECK-NEXT:      i8 4, label [[BB8]]
+; CHECK-NEXT:      i8 3, label [[BB7:%.*]]
+; CHECK-NEXT:      i8 4, label [[BB7]]
 ; CHECK-NEXT:    ]
+; CHECK:       bb7:
+; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB8]], label [[BB10]]
 ; CHECK:       bb8:
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq ptr undef, [[ARG:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[BB10]], label [[BB2]]
@@ -175,7 +177,7 @@ bb6:
   br label %bb7
 
 bb7:
-  br i1 undef, label %bb8, label %bb10
+  br i1 %arg2, label %bb8, label %bb10
 
 bb8:
   %tmp9 = icmp eq ptr undef, %arg
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash4.ll b/llvm/test/Transforms/JumpThreading/ddt-crash4.ll
index 69eec7d5f6d8b..c84b5ffbc9edc 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash4.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash4.ll
@@ -3,7 +3,7 @@
 
 @global = external global i64, align 8
 
-define void @f() {
+define void @f(i1 %arg) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
@@ -15,12 +15,28 @@ define void @f() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @global, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[BB23:%.*]], label [[BB23]]
-; CHECK:       bb23:
+; CHECK:       bb10:
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr @global, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt i64 [[TMP11]], 5
+; CHECK-NEXT:    br i1 [[TMP12]], label [[BB14:%.*]], label [[BB17:%.*]]
+; CHECK:       bb14:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB15:%.*]], label [[BB10:%.*]]
+; CHECK:       bb15:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb17:
 ; CHECK-NEXT:    br label [[BB26:%.*]]
+; CHECK:       bb18:
+; CHECK-NEXT:    br i1 [[ARG]], label [[BB23]], label [[BB14]]
+; CHECK:       bb21:
+; CHECK-NEXT:    br label [[BB26]]
+; CHECK:       bb23:
+; CHECK-NEXT:    br i1 [[ARG]], label [[BB24:%.*]], label [[BB14]]
+; CHECK:       bb24:
+; CHECK-NEXT:    br i1 [[ARG]], label [[BB28:%.*]], label [[BB21:%.*]]
 ; CHECK:       bb26:
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb27:
-; CHECK-NEXT:    br label [[BB26]]
+; CHECK-NEXT:    br label [[BB24]]
 ;
 bb:
   br label %bb1
@@ -52,7 +68,7 @@ bb13:
   br label %bb14
 
 bb14:
-  br i1 undef, label %bb15, label %bb16
+  br i1 %arg, label %bb15, label %bb16
 
 bb15:
   unreachable
@@ -64,10 +80,10 @@ bb17:
   br label %bb18
 
 bb18:
-  br i1 undef, label %bb22, label %bb13
+  br i1 %arg, label %bb22, label %bb13
 
 bb19:
-  br i1 undef, label %bb20, label %bb21
+  br i1 %arg, label %bb20, label %bb21
 
 bb20:
   unreachable
@@ -79,10 +95,10 @@ bb22:
   br label %bb23
 
 bb23:
-  br i1 undef, label %bb24, label %bb13
+  br i1 %arg, label %bb24, label %bb13
 
 bb24:
-  br i1 undef, label %bb26, label %bb25
+  br i1 %arg, label %bb26, label %bb25
 
 bb25:
   br label %bb19
diff --git a/llvm/test/Transforms/JumpThreading/landing-pad.ll b/llvm/test/Transforms/JumpThreading/landing-pad.ll
index 29def94cdda07..9c193cc061b56 100644
--- a/llvm/test/Transforms/JumpThreading/landing-pad.ll
+++ b/llvm/test/Transforms/JumpThreading/landing-pad.ll
@@ -63,7 +63,7 @@ entry:
   ret void
 }
 
-define void @_Z3fn1v() uwtable personality ptr @__gxx_personality_v0 {
+define void @_Z3fn1v(i1 %arg) uwtable personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: @_Z3fn1v(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = call noalias ptr @_Znwm()
@@ -93,6 +93,8 @@ define void @_Z3fn1v() uwtable personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[_REF_I_I_I]], align 4
 ; CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp eq i32 [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL_I_I_I]], label [[_ZN1BI1DED1EV_EXIT:%.*]], label [[DELETE_NOTNULL_I_I_I:%.*]]
+; CHECK:       if.then.i.i.i:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[_ZN1BI1DED1EV_EXIT]], label [[DELETE_NOTNULL_I_I_I1:%.*]]
 ; CHECK:       delete.notnull.i.i.i:
 ; CHECK-NEXT:    call void @_ZdlPv()
 ; CHECK-NEXT:    unreachable
@@ -138,7 +140,7 @@ lpad1:                                            ; preds = %_ZN1DC1Ev.exit, %_Z
   br i1 %tobool.i.i.i, label %_ZN1BI1DED1Ev.exit, label %if.then.i.i.i
 
 if.then.i.i.i:                                    ; preds = %lpad1
-  br i1 undef, label %_ZN1BI1DED1Ev.exit, label %delete.notnull.i.i.i
+  br i1 %arg, label %_ZN1BI1DED1Ev.exit, label %delete.notnull.i.i.i
 
 delete.notnull.i.i.i:                             ; preds = %if.then.i.i.i
   call void @_ZdlPv() #9
@@ -191,7 +193,7 @@ entry:
 
 declare void @_ZN1D16deleteKeyPressedEv()
 
-define void @_ZN1BI1DED1Ev(ptr nocapture readonly %this) unnamed_addr uwtable align 2 {
+define void @_ZN1BI1DED1Ev(ptr nocapture readonly %this, i1 %arg) unnamed_addr uwtable align 2 {
 ; CHECK-LABEL: @_ZN1BI1DED1Ev(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS:%.*]], align 8
@@ -199,6 +201,8 @@ define void @_ZN1BI1DED1Ev(ptr nocapture readonly %this) unnamed_addr uwtable al
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[_REF_I_I]], align 4
 ; CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label [[_ZN1BI1DED2EV_EXIT:%.*]], label [[DELETE_NOTNULL_I_I:%.*]]
+; CHECK:       if.then.i.i:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[_ZN1BI1DED2EV_EXIT]], label [[DELETE_NOTNULL_I_I1:%.*]]
 ; CHECK:       delete.notnull.i.i:
 ; CHECK-NEXT:    call void @_ZdlPv()
 ; CHECK-NEXT:    unreachable
@@ -213,7 +217,7 @@ entry:
   br i1 %tobool.i.i, label %_ZN1BI1DED2Ev.exit, label %if.then.i.i
 
 if.then.i.i:                                      ; preds = %entry
-  br i1 undef, label %_ZN1BI1DED2Ev.exit, label %delete.notnull.i.i
+  br i1 %arg, label %_ZN1BI1DED2Ev.exit, label %delete.notnull.i.i
 
 delete.notnull.i.i:                               ; preds = %if.then.i.i
   call void @_ZdlPv() #9
@@ -225,7 +229,7 @@ _ZN1BI1DED2Ev.exit:                               ; preds = %entry, %if.then.i.i
 
 declare hidden void @__clang_call_terminate()
 
-define void @_ZN1BI1DED2Ev(ptr nocapture readonly %this) unnamed_addr uwtable align 2 {
+define void @_ZN1BI1DED2Ev(ptr nocapture readonly %this, i1 %arg) unnamed_addr uwtable align 2 {
 ; CHECK-LABEL: @_ZN1BI1DED2Ev(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS:%.*]], align 8
@@ -233,6 +237,8 @@ define void @_ZN1BI1DED2Ev(ptr nocapture readonly %this) unnamed_addr uwtable al
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[_REF_I]], align 4
 ; CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL_I]], label [[_ZN1AI1CE5DEREFEV_EXIT:%.*]], label [[DELETE_NOTNULL_I:%.*]]
+; CHECK:       if.then.i:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[_ZN1AI1CE5DEREFEV_EXIT]], label [[DELETE_NOTNULL_I1:%.*]]
 ; CHECK:       delete.notnull.i:
 ; CHECK-NEXT:    call void @_ZdlPv()
 ; CHECK-NEXT:    unreachable
@@ -247,7 +253,7 @@ entry:
   br i1 %tobool.i, label %_ZN1AI1CE5derefEv.exit, label %if.then.i
 
 if.then.i:                                        ; preds = %entry
-  br i1 undef, label %_ZN1AI1CE5derefEv.exit, label %delete.notnull.i
+  br i1 %arg, label %_ZN1AI1CE5derefEv.exit, label %delete.notnull.i
 
 delete.notnull.i:                                 ; preds = %if.then.i
   call void @_ZdlPv() #9
@@ -257,12 +263,14 @@ _ZN1AI1CE5derefEv.exit:                           ; preds = %entry, %if.then.i
   ret void
 }
 
-define void @_ZN1AI1CE5derefEv(ptr nocapture readonly %this) nounwind uwtable align 2 {
+define void @_ZN1AI1CE5derefEv(ptr nocapture readonly %this, i1 %arg) nounwind uwtable align 2 {
 ; CHECK-LABEL: @_ZN1AI1CE5derefEv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[THIS:%.*]], align 4
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[DELETE_NOTNULL:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_END]], label [[DELETE_NOTNULL1:%.*]]
 ; CHECK:       delete.notnull:
 ; CHECK-NEXT:    call void @_ZdlPv()
 ; CHECK-NEXT:    unreachable
@@ -275,7 +283,7 @@ entry:
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
-  br i1 undef, label %if.end, label %delete.notnull
+  br i1 %arg, label %if.end, label %delete.notnull
 
 delete.notnull:                                   ; preds = %if.then
   call void @_ZdlPv() #9
diff --git a/llvm/test/Transforms/JumpThreading/pr22086.ll b/llvm/test/Transforms/JumpThreading/pr22086.ll
index c7f9fcdbd3462..44ab88bb3ea5c 100644
--- a/llvm/test/Transforms/JumpThreading/pr22086.ll
+++ b/llvm/test/Transforms/JumpThreading/pr22086.ll
@@ -3,12 +3,15 @@
 
 
 
-define void @f() {
-; CHECK-LABEL: define void @f() {
+define void @f(i1 %arg) {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: i1 [[ARG:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[LOR_RHS:.*]]
+; CHECK-NEXT:    br label %[[FOR_COND1:.*]]
+; CHECK:       [[FOR_COND1]]:
+; CHECK-NEXT:    br i1 [[ARG]], label %[[LOR_RHS:.*]], label %[[LOR_RHS]]
 ; CHECK:       [[LOR_RHS]]:
-; CHECK-NEXT:    br label %[[LOR_RHS]]
+; CHECK-NEXT:    br label %[[FOR_COND1]]
 ;
 entry:
   br label %for.cond1
@@ -23,7 +26,7 @@ if.end16:
 
 for.cond1:
   %g.1 = phi i32 [ 0, %entry ], [ 0, %lor.rhs ], [ %g.3, %if.end16 ]
-  br i1 undef, label %lor.rhs, label %if.end16
+  br i1 %arg, label %lor.rhs, label %if.end16
 
 lor.rhs:
   br label %for.cond1
diff --git a/llvm/test/Transforms/JumpThreading/pr9331.ll b/llvm/test/Transforms/JumpThreading/pr9331.ll
index 572b1e558e059..06c0a42bd849a 100644
--- a/llvm/test/Transforms/JumpThreading/pr9331.ll
+++ b/llvm/test/Transforms/JumpThreading/pr9331.ll
@@ -13,17 +13,16 @@ entry:
 
 for.cond2:                                        ; preds = %for.inc46, %lor.end, %entry
   %p_44.addr.1 = phi i8 [ %p_44.addr.1, %lor.end ], [ %p_44, %entry ], [ %p_44.addr.1, %for.inc46 ]
-  br i1 undef, label %for.inc46, label %for.body5
+  br i1 true, label %for.inc46, label %for.body5
 
 for.body5:                                        ; preds = %for.cond2
-  br i1 undef, label %lbl_465, label %if.then9
+  br i1 false, label %lbl_465, label %if.then9
 
 if.then9:                                         ; preds = %for.body5
   br label %return
 
 lbl_465:                                          ; preds = %lbl_465, %for.body5
-  %tobool19 = icmp eq i8 undef, 0
-  br i1 %tobool19, label %if.end21, label %lbl_465
+  br i1 true, label %if.end21, label %lbl_465
 
 if.end21:                                         ; preds = %lbl_465
   %conv23 = zext i8 %p_44.addr.1 to i64
diff --git a/llvm/test/Transforms/JumpThreading/preserving-debugloc-br.ll b/llvm/test/Transforms/JumpThreading/preserving-debugloc-br.ll
index ca67f0dec31ba..521c697c71208 100644
--- a/llvm/test/Transforms/JumpThreading/preserving-debugloc-br.ll
+++ b/llvm/test/Transforms/JumpThreading/preserving-debugloc-br.ll
@@ -24,11 +24,10 @@ lor.rhs:                                          ; preds = %entry
   br label %lor.end, !dbg !10
 
 lor.end:                                          ; preds = %lor.rhs, %entry
-  %cmp103 = xor i1 undef, undef, !dbg !11
-  br i1 %cmp103, label %for.cond, label %if.then, !dbg !12
+  br i1 false, label %for.cond, label %if.then, !dbg !12
 
 for.cond:                                         ; preds = %for.body, %lor.end
-  br i1 undef, label %if.then, label %for.body, !dbg !13
+  br i1 false, label %if.then, label %for.body, !dbg !13
 
 for.body:                                         ; preds = %for.cond
   br label %for.cond, !dbg !14
diff --git a/llvm/test/Transforms/JumpThreading/thread-loads.ll b/llvm/test/Transforms/JumpThreading/thread-loads.ll
index 85952e8e6db4f..6f19b3d17ff1d 100644
--- a/llvm/test/Transforms/JumpThreading/thread-loads.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-loads.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart
 ; RUN: opt < %s -passes=jump-threading -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s
 
@@ -316,13 +316,12 @@ bb3:
   ret i32 %res.0
 }
 
-; Make sure we merge the aliasing metadata. We keep the range metadata for the
-; first load, as it dominates the second load. Hence we can eliminate the
-; branch.
+; We keep the tbaa and range metadata for the first load, as it dominates the
+; second load. Hence we can eliminate the branch.
 define void @test8(ptr, ptr, ptr) {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:  ret2:
-; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !range [[RNG4:![0-9]+]], !noundef !5
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]], !range [[RNG4:![0-9]+]], !noundef [[META5:![0-9]+]]
 ; CHECK-NEXT:    store i32 [[A]], ptr [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    [[XXX:%.*]] = tail call i32 (...) @f1() #[[ATTR0]]
 ; CHECK-NEXT:    ret void
@@ -455,8 +454,8 @@ define fastcc i32 @Search(i64 %idxprom.i, i64 %idxprom.i89, i32 %c) {
 ; CHECK-NEXT:    [[ARRAYIDX89:%.*]] = getelementptr inbounds [65 x ptr], ptr @last, i64 0, i64 [[IDXPROM_I]]
 ; CHECK-NEXT:    [[PHASE:%.*]] = getelementptr inbounds [65 x %struct.NEXT_MOVE], ptr @next_status, i64 0, i64 [[IDXPROM_I]], i32 0
 ; CHECK-NEXT:    switch i32 [[C:%.*]], label [[CLEANUP:%.*]] [
-; CHECK-NEXT:    i32 1, label [[SW_BB_I:%.*]]
-; CHECK-NEXT:    i32 0, label [[SW_BB21_I:%.*]]
+; CHECK-NEXT:      i32 1, label [[SW_BB_I:%.*]]
+; CHECK-NEXT:      i32 0, label [[SW_BB21_I:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb.i:
 ; CHECK-NEXT:    [[CALL_I62:%.*]] = call fastcc ptr @GenerateCheckEvasions()
@@ -680,7 +679,6 @@ right_x:
 }
 
 
-; CHECK: [[RNG4]] = !{i32 0, i32 1}
 
 !0 = !{!3, !3, i64 0}
 !1 = !{!"omnipotent char", !2}
@@ -694,3 +692,11 @@ right_x:
 !9 = !{!7}
 !10 = !{!8}
 !11 = !{}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[RNG4]] = !{i32 0, i32 1}
+; CHECK: [[META5]] = !{}
+;.
diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
index 7b0dc4ad3ae78..d8bd3f389aae8 100644
--- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
+++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
@@ -94,7 +94,7 @@ declare i32 @a()
 ; This gets into a state that could cause instruction simplify
 ; to hang - an insertelement instruction has itself as an operand.
 
-define void @PR48362() {
+define void @PR48362(i1 %arg) {
 ;
 ; CHECK-LABEL: @PR48362(
 ; CHECK-NEXT:  cleanup.cont1500:
@@ -142,7 +142,7 @@ for.body1911:                                     ; preds = %if.else1904
   br label %land.end2173
 
 land.end2173:                                     ; preds = %for.body1911
-  br i1 undef, label %if.end2178, label %cleanup2297
+  br i1 %arg, label %if.end2178, label %cleanup2297
 
 if.end2178:                                       ; preds = %land.end2173
   %l_580.sroa.0.2.vec.insert = insertelement <4 x i32> %l_580.sroa.0.0, i32 undef, i32 1
@@ -165,7 +165,7 @@ cleanup.cont2339:                                 ; preds = %cleanup2329
   br label %for.inc2340
 
 for.inc2340:                                      ; preds = %cleanup.cont2339
-  br i1 undef, label %for.body1509, label %crit_edge115
+  br i1 %arg, label %for.body1509, label %crit_edge115
 
 crit_edge114:                                     ; preds = %cleanup2329
   unreachable
diff --git a/llvm/test/Transforms/LCSSA/indirectbr.ll b/llvm/test/Transforms/LCSSA/indirectbr.ll
index 7ddfb6bfb223d..a71696c091a69 100644
--- a/llvm/test/Transforms/LCSSA/indirectbr.ll
+++ b/llvm/test/Transforms/LCSSA/indirectbr.ll
@@ -4,25 +4,25 @@
 ; the loop, and the loop has exits with predecessors not within the loop
 ; (and btw these edges are unsplittable due to the indirectbr).
 ; PR5437
-define i32 @test0() nounwind {
+define i32 @test0(i1 %arg) nounwind {
 ; CHECK-LABEL: @test0
 entry:
-  br i1 undef, label %"4", label %"3"
+  br i1 %arg, label %"4", label %"3"
 
 "3":                                              ; preds = %entry
   ret i32 0
 
 "4":                                              ; preds = %entry
-  br i1 undef, label %"6", label %"5"
+  br i1 %arg, label %"6", label %"5"
 
 "5":                                              ; preds = %"4"
   unreachable
 
 "6":                                              ; preds = %"4"
-  br i1 undef, label %"10", label %"13"
+  br i1 %arg, label %"10", label %"13"
 
 "10":                                             ; preds = %"6"
-  br i1 undef, label %"22", label %"15"
+  br i1 %arg, label %"22", label %"15"
 
 "13":                                             ; preds = %"6"
   unreachable
@@ -393,7 +393,7 @@ entry:
   unreachable
 
 "1566":                                           ; preds = %"23"
-  br i1 undef, label %"1569", label %"1568"
+  br i1 %arg, label %"1569", label %"1568"
 
 "1568":                                           ; preds = %"1566"
   unreachable
@@ -545,13 +545,13 @@ entry:
 ; create PHIs in one of such exits we are also inserting PHIs in L2 header. This
 ; could break LCSSA form for L2 because these inserted PHIs can also have uses
 ; in L2 exits. Test that we don't assert/crash on that.
-define void @test1() {
+define void @test1(i1 %arg) {
 ; CHECK-LABEL: @test1
   br label %lab1
 
 lab1:
   %tmp21 = add i32 undef, 677038203
-  br i1 undef, label %lab2, label %exit
+  br i1 %arg, label %lab2, label %exit
 
 lab2:
   indirectbr ptr undef, [label %lab1, label %lab3]
@@ -559,7 +559,7 @@ lab2:
 lab3:
 ; CHECK: %tmp21.lcssa1 = phi i32 [ %tmp21.lcssa1, %lab4 ], [ %tmp21, %lab2 ]
   %tmp12 = phi i32 [ %tmp21, %lab2 ], [ %tmp12, %lab4 ]
-  br i1 undef, label %lab5, label %lab4
+  br i1 %arg, label %lab5, label %lab4
 
 lab4:
   br label %lab3
diff --git a/llvm/test/Transforms/LCSSA/invoke-dest.ll b/llvm/test/Transforms/LCSSA/invoke-dest.ll
index 88f5688e14af1..ea7e5f31ec0ae 100644
--- a/llvm/test/Transforms/LCSSA/invoke-dest.ll
+++ b/llvm/test/Transforms/LCSSA/invoke-dest.ll
@@ -9,9 +9,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 @.str32190 = external constant [92 x i8], align 1		; <ptr> [#uses=1]
 @.str41 = external constant [25 x i8], align 1		; <ptr> [#uses=1]
 
-define void @_ZN8EtherBus10initializeEv() personality ptr @__gxx_personality_v0 {
+define void @_ZN8EtherBus10initializeEv(i1 %arg) personality ptr @__gxx_personality_v0 {
 entry:
-	br i1 undef, label %_ZN7cObjectnwEj.exit, label %bb.i
+	br i1 %arg, label %_ZN7cObjectnwEj.exit, label %bb.i
 
 bb.i:		; preds = %entry
 	br label %_ZN7cObjectnwEj.exit
@@ -21,19 +21,19 @@ _ZN7cObjectnwEj.exit:		; preds = %bb.i, %entry
 			to label %bb1 unwind label %lpad
 
 bb1:		; preds = %_ZN7cObjectnwEj.exit
-	br i1 undef, label %_ZNK5cGate4sizeEv.exit, label %bb.i110
+	br i1 %arg, label %_ZNK5cGate4sizeEv.exit, label %bb.i110
 
 bb.i110:		; preds = %bb1
 	br label %_ZNK5cGate4sizeEv.exit
 
 _ZNK5cGate4sizeEv.exit:		; preds = %bb.i110, %bb1
-	br i1 undef, label %_ZNK5cGate4sizeEv.exit122, label %bb.i120
+	br i1 %arg, label %_ZNK5cGate4sizeEv.exit122, label %bb.i120
 
 bb.i120:		; preds = %_ZNK5cGate4sizeEv.exit
 	br label %_ZNK5cGate4sizeEv.exit122
 
 _ZNK5cGate4sizeEv.exit122:		; preds = %bb.i120, %_ZNK5cGate4sizeEv.exit
-	br i1 undef, label %bb8, label %bb2
+	br i1 %arg, label %bb8, label %bb2
 
 bb2:		; preds = %_ZNK5cGate4sizeEv.exit122
 	unreachable
@@ -51,7 +51,7 @@ invcont10:		; preds = %invcont9
 			to label %invcont11 unwind label %lpad119
 
 invcont11:		; preds = %invcont10
-	br i1 undef, label %bb12, label %bb18
+	br i1 %arg, label %bb12, label %bb18
 
 bb12:		; preds = %invcont11
 	invoke void (ptr, ptr, ...) @_ZN6cEnvir6printfEPKcz(ptr null, ptr @.str12, i32 undef)
@@ -61,10 +61,10 @@ bb.i.i159:		; preds = %bb12
 	unreachable
 
 bb18:		; preds = %invcont11
-	br i1 undef, label %bb32, label %bb34
+	br i1 %arg, label %bb32, label %bb34
 
 bb32:		; preds = %bb18
-	br i1 undef, label %bb.i.i123, label %bb34
+	br i1 %arg, label %bb.i.i123, label %bb34
 
 bb.i.i123:		; preds = %bb32
 	br label %bb34
@@ -74,7 +74,7 @@ bb34:		; preds = %bb.i.i123, %bb32, %bb18
 			to label %invcont35 unwind label %lpad119		; <ptr> [#uses=0]
 
 invcont35:		; preds = %bb34
-	br i1 undef, label %bb49, label %bb61
+	br i1 %arg, label %bb49, label %bb61
 
 bb49:		; preds = %invcont35
 	invoke void (ptr, ptr, ...) @_ZNK13cSimpleModule5errorEPKcz(ptr undef, ptr @.str32190)
diff --git a/llvm/test/Transforms/LCSSA/pr28424.ll b/llvm/test/Transforms/LCSSA/pr28424.ll
index 42d8de19e9148..3f247a2023722 100644
--- a/llvm/test/Transforms/LCSSA/pr28424.ll
+++ b/llvm/test/Transforms/LCSSA/pr28424.ll
@@ -8,19 +8,19 @@ target triple = "x86_64-unknown-linux-gnu"
 ; it.
 
 ; CHECK-LABEL: @foo1
-define internal i32 @foo1() {
+define internal i32 @foo1(i1 %arg) {
 entry:
   br label %header
 
 header:
   %x = add i32 0, 1
-  br i1 undef, label %if, label %loopexit1
+  br i1 %arg, label %if, label %loopexit1
 
 if:
-  br i1 undef, label %latch, label %loopexit2
+  br i1 %arg, label %latch, label %loopexit2
 
 latch:
-  br i1 undef, label %header, label %loopexit3
+  br i1 %arg, label %header, label %loopexit3
 
 ; CHECK: loopexit1:
 ; CHECK:   %x.lcssa = phi i32 [ %x, %header ]
@@ -40,7 +40,7 @@ loopexit3:
 ; CHECK: loop_with_insert_point:
 ; CHECK:   %x4 = phi i32 [ %x4, %loop_with_insert_point ], [ %x.lcssa2, %loopexit3 ], [ %x.lcssa, %loopexit1 ]
 loop_with_insert_point:
-  br i1 undef, label %loop_with_insert_point, label %bb
+  br i1 %arg, label %loop_with_insert_point, label %bb
 
 ; CHECK: bb:
 ; CHECK:   %x4.lcssa = phi i32 [ %x4, %loop_with_insert_point ]
@@ -54,16 +54,16 @@ exit:
 }
 
 ; CHECK-LABEL: @foo2
-define internal i32 @foo2() {
+define internal i32 @foo2(i1 %arg) {
 entry:
   br label %header
 
 header:
   %x = add i32 0, 1
-  br i1 undef, label %latch, label %loopexit1
+  br i1 %arg, label %latch, label %loopexit1
 
 latch:
-  br i1 undef, label %header, label %loopexit2
+  br i1 %arg, label %header, label %loopexit2
 
 ; CHECK: loopexit1:
 ; CHECK:   %x.lcssa = phi i32 [ %x, %header ]
@@ -78,7 +78,7 @@ loopexit2:
 ; CHECK: loop_with_insert_point:
 ; CHECK:   %x2 = phi i32 [ %x2, %loop_with_insert_point ], [ %x.lcssa1, %loopexit2 ], [ %x.lcssa, %loopexit1 ]
 loop_with_insert_point:
-  br i1 undef, label %loop_with_insert_point, label %exit
+  br i1 %arg, label %loop_with_insert_point, label %exit
 
 ; CHECK: exit:
 ; CHECK:   %x2.lcssa = phi i32 [ %x2, %loop_with_insert_point ]
diff --git a/llvm/test/Transforms/LCSSA/pr28608.ll b/llvm/test/Transforms/LCSSA/pr28608.ll
index 0b0fb664092d0..b522bea806d22 100644
--- a/llvm/test/Transforms/LCSSA/pr28608.ll
+++ b/llvm/test/Transforms/LCSSA/pr28608.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; PR28608
 ; Check that we don't crash on this test.
 
-define void @foo() {
+define void @foo(i1 %arg) {
 entry:
   br label %bb1
 
@@ -14,10 +14,10 @@ bb1:
 
 bb2:
   %x = phi i32 [ undef, %bb5 ], [ undef, %bb1 ]
-  br i1 undef, label %bb3, label %bb6
+  br i1 %arg, label %bb3, label %bb6
 
 bb3:
-  br i1 undef, label %bb5, label %bb4
+  br i1 %arg, label %bb5, label %bb4
 
 bb4:
   br label %bb6
diff --git a/llvm/test/Transforms/LCSSA/remove-phis.ll b/llvm/test/Transforms/LCSSA/remove-phis.ll
index 67e9a035cad79..8b669a29c2de9 100644
--- a/llvm/test/Transforms/LCSSA/remove-phis.ll
+++ b/llvm/test/Transforms/LCSSA/remove-phis.ll
@@ -3,37 +3,37 @@
 ; This bugpoint reduced test case used to assert when removing unused PHI nodes.
 ; Just verify that we do not assert/crash.
 
-define void @test() {
+define void @test(i1 %arg) {
 entry:
   br label %gazank
 
 gazank:
   %value = phi i16 [ 0, %entry ], [ undef, %gazonk ]
-  br i1 undef, label %gazink, label %qqq
+  br i1 %arg, label %gazink, label %qqq
 
 gazink:
-  br i1 undef, label %gazonk, label %infinite.loop.pred
+  br i1 %arg, label %gazonk, label %infinite.loop.pred
 
 gazonk:
-  br i1 undef, label %exit1, label %gazank
+  br i1 %arg, label %exit1, label %gazank
 
 qqq:
-  br i1 undef, label %www, label %exit2
+  br i1 %arg, label %www, label %exit2
 
 www:
-  br i1 undef, label %qqq, label %foo.pred
+  br i1 %arg, label %qqq, label %foo.pred
 
 foo.pred:
   br label %foo
 
 foo:
-  br i1 undef, label %bar, label %exit1.pred
+  br i1 %arg, label %bar, label %exit1.pred
 
 bar:
-  br i1 undef, label %foo, label %exit2.pred
+  br i1 %arg, label %foo, label %exit2.pred
 
 unreachable1:
-  br i1 undef, label %foo, label %exit2.pred
+  br i1 %arg, label %foo, label %exit2.pred
 
 exit1.pred:
   br label %exit1
diff --git a/llvm/test/Transforms/LCSSA/unused-phis.ll b/llvm/test/Transforms/LCSSA/unused-phis.ll
index 1a5f27e550315..baead279c86f9 100644
--- a/llvm/test/Transforms/LCSSA/unused-phis.ll
+++ b/llvm/test/Transforms/LCSSA/unused-phis.ll
@@ -12,19 +12,19 @@
 
 declare void @printf(i32 %i)
 
-define i32 @unused_phis() nounwind {
+define i32 @unused_phis(i1 %arg) nounwind {
 entry:
   br label %loop
 
 loop:
   %i = phi i32 [0, %entry], [1, %then2]
-  br i1 undef, label %exit1, label %then1
+  br i1 %arg, label %exit1, label %then1
 
 then1:
-  br i1 undef, label %exit2, label %then2
+  br i1 %arg, label %exit2, label %then2
 
 then2:
-  br i1 undef, label %exit3, label %loop
+  br i1 %arg, label %exit3, label %loop
 
 exit1:
   call void @printf(i32 %i)
diff --git a/llvm/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll b/llvm/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll
index 9fded3277fdb4..10b2d7331b5b4 100644
--- a/llvm/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll
+++ b/llvm/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll
@@ -1,18 +1,18 @@
 ; Test for rdar://7452967
 ; RUN: opt < %s -passes=licm -disable-output
-define void @foo (ptr %v)
+define void @foo (ptr %arg)
 {
   entry:
-    br i1 undef, label %preheader, label %return
+    br i1 false, label %preheader, label %return
 
   preheader:
-    br i1 undef, label %loop, label %return
+    br i1 false, label %loop, label %return
 
   loop:
-    indirectbr ptr undef, [label %preheader, label %stuff]
+    indirectbr ptr %arg, [label %preheader, label %stuff]
 
   stuff:
-    %0 = load i8, ptr undef, align 1
+    %0 = load i8, ptr %arg, align 1
     br label %loop
 
   return:
diff --git a/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll
new file mode 100644
index 0000000000000..a040c3cc6947c
--- /dev/null
+++ b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>,licm)' -verify-memoryssa -S < %s | FileCheck %s
+
+; Check that running LICM after SimpleLoopUnswitch does not result in a crash.
+
+define i32 @foo(i1 %arg, ptr %arg1) {
+; CHECK-LABEL: define i32 @foo(
+; CHECK-SAME: i1 [[ARG:%.*]], ptr [[ARG1:%.*]]) {
+; CHECK-NEXT:  [[START:.*:]]
+; CHECK-NEXT:    [[ARG_FR:%.*]] = freeze i1 [[ARG]]
+; CHECK-NEXT:    br i1 [[ARG_FR]], label %[[START_SPLIT_US:.*]], label %[[START_SPLIT:.*]]
+; CHECK:       [[START_SPLIT_US]]:
+; CHECK-NEXT:    br label %[[LOOP_US:.*]]
+; CHECK:       [[LOOP_US]]:
+; CHECK-NEXT:    br label %[[BB0:.*]]
+; CHECK:       [[BB0]]:
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[UNSWITCHED_SELECT_US:%.*]] = phi ptr [ [[ARG1]], %[[BB0]] ]
+; CHECK-NEXT:    [[I3_US:%.*]] = call i32 [[UNSWITCHED_SELECT_US]]()
+; CHECK-NEXT:    br i1 true, label %[[LOOP_US]], label %[[RET_SPLIT_US:.*]]
+; CHECK:       [[RET_SPLIT_US]]:
+; CHECK-NEXT:    [[I3_LCSSA_US:%.*]] = phi i32 [ [[I3_US]], %[[BB1]] ]
+; CHECK-NEXT:    br label %[[RET:.*]]
+; CHECK:       [[START_SPLIT]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    br i1 false, label %[[LOOP]], label %[[RET_SPLIT:.*]]
+; CHECK:       [[RET_SPLIT]]:
+; CHECK-NEXT:    [[I3_LE:%.*]] = call i32 @bar()
+; CHECK-NEXT:    br label %[[RET]]
+; CHECK:       [[RET]]:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[I3_LE]], %[[RET_SPLIT]] ], [ [[I3_LCSSA_US]], %[[RET_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+start:
+  br label %loop
+
+loop:                                              ; preds = %loop, %start
+  %i = select i1 %arg, ptr %arg1, ptr @bar
+  %i3 = call i32 %i()
+  br i1 %arg, label %loop, label %ret
+
+ret:                                              ; preds = %loop
+  ret i32 %i3
+}
+
+declare i32 @bar() nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/LICM/alias-set-tracker-loss.ll b/llvm/test/Transforms/LICM/alias-set-tracker-loss.ll
index ff8d758d05ae6..1670d865e89dd 100644
--- a/llvm/test/Transforms/LICM/alias-set-tracker-loss.ll
+++ b/llvm/test/Transforms/LICM/alias-set-tracker-loss.ll
@@ -6,7 +6,7 @@
 ; removed, its alias set tracker is destroyed and no longer available when LICM
 ; runs on the outer loop.
 
-define void @f() {
+define void @f(i1 %arg) {
 entry:
   br label %l1
 
@@ -17,7 +17,7 @@ l1.loopexit:
   br label %l1.backedge
 
 l1:
-  br i1 undef, label %l1.backedge, label %l2.preheader
+  br i1 %arg, label %l1.backedge, label %l2.preheader
 
 l1.backedge:
   br label %l1
diff --git a/llvm/test/Transforms/LICM/assume.ll b/llvm/test/Transforms/LICM/assume.ll
index bfb0359dbdff0..0a00f1ef7def5 100644
--- a/llvm/test/Transforms/LICM/assume.ll
+++ b/llvm/test/Transforms/LICM/assume.ll
@@ -1,19 +1,19 @@
 ; RUN: opt -passes=licm < %s -S | FileCheck %s
 ; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<target-ir>,require<scalar-evolution>,require<opt-remark-emit>,loop-mssa(licm)' < %s -S | FileCheck %s
 
-define void @f_0(i1 %p) nounwind ssp {
+define void @f_0(i1 %p, i1 %arg) nounwind ssp {
 ; CHECK-LABEL: @f_0(
 entry:
   br label %for.body
 
 for.body:
-  br i1 undef, label %if.then, label %for.cond.backedge
+  br i1 %arg, label %if.then, label %for.cond.backedge
 
 for.cond.backedge:
-  br i1 undef, label %for.end104, label %for.body
+  br i1 %arg, label %for.end104, label %for.body
 
 if.then:
-  br i1 undef, label %if.then27, label %if.end.if.end.split_crit_edge.critedge
+  br i1 %arg, label %if.then27, label %if.end.if.end.split_crit_edge.critedge
 
 if.then27:
 ; CHECK: tail call void @llvm.assume
@@ -24,10 +24,10 @@ if.end.if.end.split_crit_edge.critedge:
   br label %for.body61
 
 for.body61.us:
-  br i1 undef, label %for.cond.backedge, label %for.body61.us
+  br i1 %arg, label %for.cond.backedge, label %for.body61.us
 
 for.body61:
-  br i1 undef, label %for.cond.backedge, label %for.body61
+  br i1 %arg, label %for.cond.backedge, label %for.body61
 
 for.end104:
   ret void
diff --git a/llvm/test/Transforms/LICM/callbr-crash.ll b/llvm/test/Transforms/LICM/callbr-crash.ll
index 60f7f6c7dd768..1bc60e575663b 100644
--- a/llvm/test/Transforms/LICM/callbr-crash.ll
+++ b/llvm/test/Transforms/LICM/callbr-crash.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes=licm < %s | FileCheck %s
 
-define i32 @j() {
+define i32 @j(i1 %arg) {
 ; CHECK-LABEL: @j(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
@@ -9,7 +9,7 @@ define i32 @j() {
 ; CHECK-NEXT:    callbr void asm sideeffect "", "!i,~{dirflag},~{fpsr},~{flags}"()
 ; CHECK-NEXT:    to label [[COND_TRUE_I:%.*]] [label %for.end.split.loop.exit1]
 ; CHECK:       cond.true.i:
-; CHECK-NEXT:    br i1 true, label [[FOR_END_SPLIT_LOOP_EXIT:%.*]], label [[FOR_COND]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_END_SPLIT_LOOP_EXIT:%.*]], label [[FOR_COND]]
 ; CHECK:       for.end.split.loop.exit:
 ; CHECK-NEXT:    [[ASMRESULT1_I_I_LE:%.*]] = extractvalue { i8, i32 } zeroinitializer, 1
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
@@ -29,7 +29,7 @@ for.cond:                                         ; preds = %cond.true.i, %entry
 
 cond.true.i:                                      ; preds = %for.cond
   %asmresult1.i.i = extractvalue { i8, i32 } zeroinitializer, 1
-  br i1 undef, label %for.end, label %for.cond
+  br i1 %arg, label %for.end, label %for.cond
 
 for.end:                                          ; preds = %cond.true.i, %for.cond
   %phi = phi i32 [ %asmresult1.i.i, %cond.true.i ], [ undef, %for.cond ]
diff --git a/llvm/test/Transforms/LICM/crash.ll b/llvm/test/Transforms/LICM/crash.ll
index 2cb44daf8e24f..03fc39b77a5a5 100644
--- a/llvm/test/Transforms/LICM/crash.ll
+++ b/llvm/test/Transforms/LICM/crash.ll
@@ -42,10 +42,10 @@ for.body:                                         ; preds = %for.body, %entry
 }
 
 ; PR8102
-define void @test3() {
+define void @test3(i1 %arg) {
 entry:
   %__first = alloca { ptr }
-  br i1 undef, label %for.cond, label %for.end
+  br i1 %arg, label %for.cond, label %for.end
 
 for.cond:                                         ; preds = %for.cond, %entry
   %tmp2 = load ptr, ptr %__first, align 4
diff --git a/llvm/test/Transforms/LICM/debug-value.ll b/llvm/test/Transforms/LICM/debug-value.ll
index 17ad57f58898d..cbbfbe6c3beb8 100644
--- a/llvm/test/Transforms/LICM/debug-value.ll
+++ b/llvm/test/Transforms/LICM/debug-value.ll
@@ -3,18 +3,18 @@
 
 ; RUN: opt -passes=licm < %s -S --try-experimental-debuginfo-iterators | FileCheck %s
 
-define void @dgefa() nounwind ssp {
+define void @dgefa(i1 %arg) nounwind ssp {
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.cond.backedge, %entry
-  br i1 undef, label %if.then, label %for.cond.backedge, !dbg !11
+  br i1 %arg, label %if.then, label %for.cond.backedge, !dbg !11
 
 for.cond.backedge:                                ; preds = %for.body61, %for.body61.us, %for.body
-  br i1 undef, label %for.end104, label %for.body, !dbg !15
+  br i1 %arg, label %for.end104, label %for.body, !dbg !15
 
 if.then:                                          ; preds = %for.body
-  br i1 undef, label %if.then27, label %if.end.if.end.split_crit_edge.critedge, !dbg !16
+  br i1 %arg, label %if.then27, label %if.end.if.end.split_crit_edge.critedge, !dbg !16
 
 if.then27:                                        ; preds = %if.then
 ; CHECK: #dbg_value
@@ -25,10 +25,10 @@ if.end.if.end.split_crit_edge.critedge:           ; preds = %if.then
   br label %for.body61
 
 for.body61.us:                                    ; preds = %for.body61.us, %if.then27
-  br i1 undef, label %for.cond.backedge, label %for.body61.us, !dbg !23
+  br i1 %arg, label %for.cond.backedge, label %for.body61.us, !dbg !23
 
 for.body61:                                       ; preds = %for.body61, %if.end.if.end.split_crit_edge.critedge
-  br i1 undef, label %for.cond.backedge, label %for.body61, !dbg !23
+  br i1 %arg, label %for.cond.backedge, label %for.body61, !dbg !23
 
 for.end104:                                       ; preds = %for.cond.backedge
   ret void, !dbg !24
diff --git a/llvm/test/Transforms/LICM/gc-relocate.ll b/llvm/test/Transforms/LICM/gc-relocate.ll
index 2149e74b7ab0d..69b16a9b94d4d 100644
--- a/llvm/test/Transforms/LICM/gc-relocate.ll
+++ b/llvm/test/Transforms/LICM/gc-relocate.ll
@@ -4,15 +4,15 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
 target triple = "x86_64-unknown-linux-gnu"
 
-define ptr addrspace(1) @test(ptr addrspace(1) %arg) #0 gc "statepoint-example" personality ptr @wobble {
+define ptr addrspace(1) @test(ptr addrspace(1) %arg, i1 %arg2) #0 gc "statepoint-example" personality ptr @wobble {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 1, i32 16, ptr nonnull elementtype(i32 (i32, ptr addrspace(1), i32, i32, i32)) @zot, i32 5, i32 0, i32 undef, ptr addrspace(1) undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0) [ "deopt"(i32 0, i32 0, i32 0, i32 235, i32 3, i32 32, i32 0, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 3, float undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 4, double undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, float undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) null, i32 3, i32 -15108, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null), "gc-live"(ptr addrspace(1) [[ARG:%.*]]) ]
-; CHECK-NEXT:    br i1 false, label [[BB1]], label [[BB2:%.*]]
+; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[RES_LE:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[TMP]], i32 0, i32 0) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[RES_LE:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[TMP]], i32 0, i32 0) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret ptr addrspace(1) [[RES_LE]]
 ;
 bb:
@@ -21,7 +21,7 @@ bb:
 bb1:
   %tmp = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 1, i32 16, ptr nonnull elementtype(i32 (i32, ptr addrspace(1), i32, i32, i32)) @zot, i32 5, i32 0, i32 undef, ptr addrspace(1) undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0) [ "deopt"(i32 0, i32 0, i32 0, i32 235, i32 3, i32 32, i32 0, i32 0, ptr addrspace(1) undef, i32 3, i32 undef, i32 3, float undef, i32 0, ptr addrspace(1) undef, i32 7, ptr null, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 0, ptr addrspace(1) undef, i32 4, double undef, i32 7, ptr null, i32 0, ptr addrspace(1) undef, i32 3, float undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) undef, i32 0, ptr addrspace(1) null, i32 3, i32 -15108, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null, i32 7, ptr null), "gc-live"(ptr addrspace(1) %arg) ]
   %res = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %tmp, i32 0, i32 0) readnone
-  br i1 undef, label %bb1, label %bb2
+  br i1 %arg2, label %bb1, label %bb2
 
 bb2:
   ret ptr addrspace(1) %res
diff --git a/llvm/test/Transforms/LICM/hoist-phi.ll b/llvm/test/Transforms/LICM/hoist-phi.ll
index 4d8752d790737..bf999b98a1dac 100644
--- a/llvm/test/Transforms/LICM/hoist-phi.ll
+++ b/llvm/test/Transforms/LICM/hoist-phi.ll
@@ -819,7 +819,7 @@ then:
 ; to check that we have a unique loop preheader when we hoist the store (and so
 ; don't fail an assertion).
 ; CHECK-LABEL: @triangles_in_diamond
-define void @triangles_in_diamond(ptr %ptr) {
+define void @triangles_in_diamond(ptr %ptr, i1 %arg) {
 ; CHECK-LABEL: entry:
 ; CHECK: store i32 0, ptr %ptr, align 4
 ; CHECK: br label %loop
@@ -827,16 +827,16 @@ entry:
   br label %loop
 
 loop:
-  br i1 undef, label %left_triangle_1, label %right_triangle
+  br i1 %arg, label %left_triangle_1, label %right_triangle
 
 left_triangle_1:
-  br i1 undef, label %left_triangle_1_if, label %left_triangle_2
+  br i1 %arg, label %left_triangle_1_if, label %left_triangle_2
 
 left_triangle_1_if:
   br label %left_triangle_2
 
 left_triangle_2:
-  br i1 undef, label %left_triangle_2_if, label %left_triangle_2_then
+  br i1 %arg, label %left_triangle_2_if, label %left_triangle_2_then
 
 left_triangle_2_if:
   br label %left_triangle_2_then
@@ -845,7 +845,7 @@ left_triangle_2_then:
   br label %loop.end
 
 right_triangle:
-  br i1 undef, label %right_triangle.if, label %right_triangle.then
+  br i1 %arg, label %right_triangle.if, label %right_triangle.then
 
 right_triangle.if:
   br label %right_triangle.then
@@ -860,7 +860,7 @@ loop.end:
 
 ; %cmp dominates its used after being hoisted, but not after %brmerge is rehoisted
 ; CHECK-LABEL: @rehoist
-define void @rehoist(ptr %this, i32 %x) {
+define void @rehoist(ptr %this, i32 %x, i1 %arg) {
 ; CHECK-LABEL: entry:
 ; CHECK-DAG: %sub = add nsw i32 %x, -1
 ; CHECK-DAG: %cmp = icmp eq i32 0, %sub
@@ -870,7 +870,7 @@ entry:
   br label %loop
 
 loop:
-  br i1 undef, label %if1, label %else1
+  br i1 %arg, label %if1, label %else1
 
 if1:
   call void %this(ptr %this)
@@ -897,7 +897,7 @@ end:
 ; A test case that uses empty blocks in a way that can cause control flow
 ; hoisting to get confused.
 ; CHECK-LABEL: @empty_blocks_multiple_conditional_branches
-define void @empty_blocks_multiple_conditional_branches(float %arg, ptr %ptr) {
+define void @empty_blocks_multiple_conditional_branches(float %arg, ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: entry
 ; CHECK-DAG: %div1 = fmul float %arg, 4.000000e+00
 ; CHECK-DAG: %div2 = fmul float %arg, 2.000000e+00
@@ -910,10 +910,10 @@ entry:
 ; CHECK: br label %loop
 
 loop:
-  br i1 undef, label %backedge2, label %cond1
+  br i1 %arg2, label %backedge2, label %cond1
 
 cond1:
-  br i1 undef, label %cond1.if, label %cond1.else
+  br i1 %arg2, label %cond1.if, label %cond1.else
 
 cond1.else:
   br label %cond3
@@ -926,7 +926,7 @@ cond1.if.next:
 
 cond2:
   %div1 = fmul float %arg, 4.000000e+00
-  br i1 undef, label %cond2.if, label %cond2.then
+  br i1 %arg2, label %cond2.if, label %cond2.then
 
 cond2.if:
   br label %cond2.then
@@ -939,7 +939,7 @@ cond2.then:
   br label %backedge2
 
 cond3:
-  br i1 undef, label %cond3.then, label %cond3.if
+  br i1 %arg2, label %cond3.then, label %cond3.if
 
 cond3.if:
   %div2 = fmul float %arg, 2.000000e+00
@@ -955,7 +955,7 @@ backedge2:
 
 ; We can't do much here, so mainly just check that we don't crash.
 ; CHECK-LABEL: @many_path_phi
-define void @many_path_phi(ptr %ptr1, ptr %ptr2) {
+define void @many_path_phi(ptr %ptr1, ptr %ptr2, i1 %arg) {
 ; CHECK-LABEL: entry:
 ; CHECK-DAG: %gep3 = getelementptr inbounds i32, ptr %ptr2, i32 2
 ; CHECK-DAG: %gep2 = getelementptr inbounds i32, ptr %ptr2, i32 2
@@ -969,7 +969,7 @@ loop:
   br i1 %cmp1, label %cond2, label %cond1
 
 cond1:
-  br i1 undef, label %end, label %cond1.else
+  br i1 %arg, label %end, label %cond1.else
 
 cond1.else:
   %gep2 = getelementptr inbounds i32, ptr %ptr2, i32 2
@@ -981,7 +981,7 @@ cond1.end:
   br label %end
 
 cond2:
-  br i1 undef, label %end, label %cond2.else
+  br i1 %arg, label %end, label %cond2.else
 
 cond2.else:
   %gep3 = getelementptr inbounds i32, ptr %ptr2, i32 2
@@ -1000,7 +1000,7 @@ end:
 ; Check that we correctly handle the hoisting of %gep when theres a critical
 ; edge that branches to the preheader.
 ; CHECK-LABEL: @crit_edge
-define void @crit_edge(ptr %ptr, i32 %idx, i1 %cond1, i1 %cond2) {
+define void @crit_edge(ptr %ptr, i32 %idx, i1 %cond1, i1 %cond2, i1 %arg) {
 ; CHECK-LABEL: entry:
 ; CHECK: %gep = getelementptr inbounds i32, ptr %ptr, i32 %idx
 ; CHECK: br label %preheader
@@ -1030,7 +1030,7 @@ crit_edge:
 ; Check that the conditional sub is correctly hoisted from the inner loop to the
 ; preheader of the outer loop.
 ; CHECK-LABEL: @hoist_from_innermost_loop
-define void @hoist_from_innermost_loop(i32 %nx, ptr %ptr) {
+define void @hoist_from_innermost_loop(i32 %nx, ptr %ptr, i1 %arg) {
 ; CHECK-LABEL: entry:
 ; CHECK-DAG: %sub = sub nsw i32 0, %nx
 ; CHECK: br label %outer_loop
@@ -1044,7 +1044,7 @@ middle_loop:
   br label %inner_loop
 
 inner_loop:
-  br i1 undef, label %inner_loop_end, label %if
+  br i1 %arg, label %inner_loop_end, label %if
 
 if:
   %sub = sub nsw i32 0, %nx
@@ -1052,10 +1052,10 @@ if:
   br label %inner_loop_end
 
 inner_loop_end:
-  br i1 undef, label %inner_loop, label %middle_loop_end
+  br i1 %arg, label %inner_loop, label %middle_loop_end
 
 middle_loop_end:
-  br i1 undef, label %middle_loop, label %outer_loop_end
+  br i1 %arg, label %middle_loop, label %outer_loop_end
 
 outer_loop_end:
   br label %outer_loop
@@ -1231,7 +1231,7 @@ end:
 ; two destinations that are actually the same. We can't hoist this.
 ; TODO: This could be hoisted by erasing one of the incoming values.
 ; CHECK-LABEL: @phi_multiple_values_same_block
-define i32 @phi_multiple_values_same_block(i32 %arg) {
+define i32 @phi_multiple_values_same_block(i32 %arg, i1 %arg2) {
 ; CHECK-LABEL: entry:
 ; CHECK: %cmp = icmp sgt i32 %arg, 4
 ; CHECK-NOT: phi
@@ -1244,11 +1244,11 @@ loop:
   br i1 %cmp, label %if, label %then
 
 if:
-  br i1 undef, label %then, label %then
+  br i1 %arg2, label %then, label %then
 
 then:
   %phi = phi i32 [ %arg, %loop ], [ 1, %if ], [ 1, %if ]
-  br i1 undef, label %exit, label %loop
+  br i1 %arg2, label %exit, label %loop
 
 exit:
   ret i32 %phi
@@ -1379,12 +1379,12 @@ loop.backedge:
 ; The order that we hoist instructions from the loop is different to the textual
 ; order in the function. Check that we can rehoist this correctly.
 ; CHECK-LABEL: @rehoist_wrong_order_1
-define void @rehoist_wrong_order_1(ptr %ptr) {
+define void @rehoist_wrong_order_1(ptr %ptr, i1 %arg) {
 ; CHECK-LABEL: entry
 ; CHECK-DAG: %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
 ; CHECK-DAG: %gep3 = getelementptr inbounds i32, ptr %ptr, i64 3
 ; CHECK-DAG: %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
-; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
 entry:
   br label %loop
 
@@ -1395,7 +1395,7 @@ entry:
 ; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
 
 ; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
-; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
 
 ; CHECK-ENABLED: [[IF3_LICM]]:
 ; CHECK-ENABLED: br label %[[END_LICM]]
@@ -1404,7 +1404,7 @@ entry:
 ; CHECK: br label %loop
 
 loop:
-  br i1 undef, label %if1, label %else1
+  br i1 %arg, label %if1, label %else1
 
 if1:
   %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
@@ -1414,10 +1414,10 @@ if1:
 else1:
   %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
   store i32 0, ptr %gep2, align 4
-  br i1 undef, label %if2, label %loop.backedge
+  br i1 %arg, label %if2, label %loop.backedge
 
 if2:
-  br i1 undef, label %if3, label %end
+  br i1 %arg, label %if3, label %end
 
 if3:
   %gep3 = getelementptr inbounds i32, ptr %ptr, i64 3
@@ -1433,12 +1433,12 @@ loop.backedge:
 }
 
 ; CHECK-LABEL: @rehoist_wrong_order_2
-define void @rehoist_wrong_order_2(ptr %ptr) {
+define void @rehoist_wrong_order_2(ptr %ptr, i1 %arg) {
 ; CHECK-LABEL: entry
 ; CHECK-DAG: %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
 ; CHECK-DAG: %gep3 = getelementptr inbounds i32, ptr %gep2, i64 3
 ; CHECK-DAG: %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
-; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
 entry:
   br label %loop
 
@@ -1449,7 +1449,7 @@ entry:
 ; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
 
 ; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
-; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
 
 ; CHECK-ENABLED: [[IF3_LICM]]:
 ; CHECK-ENABLED: br label %[[END_LICM]]
@@ -1458,7 +1458,7 @@ entry:
 ; CHECK: br label %loop
 
 loop:
-  br i1 undef, label %if1, label %else1
+  br i1 %arg, label %if1, label %else1
 
 if1:
   %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
@@ -1468,10 +1468,10 @@ if1:
 else1:
   %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
   store i32 0, ptr %gep2, align 4
-  br i1 undef, label %if2, label %loop.backedge
+  br i1 %arg, label %if2, label %loop.backedge
 
 if2:
-  br i1 undef, label %if3, label %end
+  br i1 %arg, label %if3, label %end
 
 if3:
   %gep3 = getelementptr inbounds i32, ptr %gep2, i64 3
@@ -1486,11 +1486,11 @@ loop.backedge:
 }
 
 ; CHECK-LABEL: @rehoist_wrong_order_3
-define void @rehoist_wrong_order_3(ptr %ptr) {
+define void @rehoist_wrong_order_3(ptr %ptr, i1 %arg) {
 ; CHECK-LABEL: entry
 ; CHECK-DAG: %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
 ; CHECK-DAG: %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
-; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
 entry:
   br label %loop
 
@@ -1503,7 +1503,7 @@ entry:
 ; CHECK-ENABLED: [[IF2_LICM]]:
 ; CHECK-ENABLED: %phi = phi ptr [ %gep1, %[[IF1_LICM]] ], [ %gep2, %[[ELSE1_LICM]] ]
 ; CHECK-ENABLED: %gep3 = getelementptr inbounds i32, ptr %phi, i64 3
-; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+; CHECK-ENABLED: br i1 %arg, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
 
 ; CHECK-ENABLED: [[IF3_LICM]]:
 ; CHECK-ENABLED: br label %[[END_LICM]]
@@ -1512,7 +1512,7 @@ entry:
 ; CHECK: br label %loop
 
 loop:
-  br i1 undef, label %if1, label %else1
+  br i1 %arg, label %if1, label %else1
 
 if1:
   %gep1 = getelementptr inbounds i32, ptr %ptr, i64 1
@@ -1522,11 +1522,11 @@ if1:
 else1:
   %gep2 = getelementptr inbounds i32, ptr %ptr, i64 2
   store i32 0, ptr %gep2, align 4
-  br i1 undef, label %if2, label %loop.backedge
+  br i1 %arg, label %if2, label %loop.backedge
 
 if2:
   %phi = phi ptr [ %gep1, %if1 ], [ %gep2, %else1 ]
-  br i1 undef, label %if3, label %end
+  br i1 %arg, label %if3, label %end
 
 if3:
   %gep3 = getelementptr inbounds i32, ptr %phi, i64 3
diff --git a/llvm/test/Transforms/LICM/lcssa-ssa-promoter.ll b/llvm/test/Transforms/LICM/lcssa-ssa-promoter.ll
index a68fb0d00af1b..6081040f7cf0e 100644
--- a/llvm/test/Transforms/LICM/lcssa-ssa-promoter.ll
+++ b/llvm/test/Transforms/LICM/lcssa-ssa-promoter.ll
@@ -11,21 +11,21 @@ target triple = "x86_64-unknown-linux-gnu"
 @x = common global i32 0, align 4
 @y = common global i32 0, align 4
 
-define void @PR18688() {
+define void @PR18688(i1 %arg) {
 ; CHECK-LABEL: @PR18688(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 undef, label [[RETURN:%.*]], label [[OUTER_PREHEADER:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[RETURN:%.*]], label [[OUTER_PREHEADER:%.*]]
 ; CHECK:       outer.preheader:
 ; CHECK-NEXT:    [[Y_VAL:%.*]] = load i32, ptr @y, align 4
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i32 [[Y_VAL]], 0
 ; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
-; CHECK-NEXT:    br i1 undef, label [[OUTER_LATCH:%.*]], label [[INNER_PREHEADER:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[OUTER_LATCH:%.*]], label [[INNER_PREHEADER:%.*]]
 ; CHECK:       inner.preheader:
 ; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
 ; CHECK:       inner.header:
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[TMP1:%.*]], [[INNER_LATCH:%.*]] ], [ 0, [[INNER_PREHEADER]] ]
-; CHECK-NEXT:    br i1 undef, label [[INNER_BODY_RHS:%.*]], label [[INNER_LATCH]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[INNER_BODY_RHS:%.*]], label [[INNER_LATCH]]
 ; CHECK:       inner.body.rhs:
 ; CHECK-NEXT:    br label [[INNER_LATCH]]
 ; CHECK:       inner.latch:
@@ -36,7 +36,7 @@ define void @PR18688() {
 ; CHECK-NEXT:    br label [[OUTER_LATCH]]
 ; CHECK:       outer.latch:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTLCSSA1]], [[INNER_EXIT]] ], [ 0, [[OUTER_HEADER]] ]
-; CHECK-NEXT:    br i1 true, label [[OUTER_EXIT:%.*]], label [[OUTER_HEADER]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[OUTER_EXIT:%.*]], label [[OUTER_HEADER]]
 ; CHECK:       outer.exit:
 ; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP2]], [[OUTER_LATCH]] ]
 ; CHECK-NEXT:    store i32 [[DOTLCSSA]], ptr @x, align 4
@@ -46,20 +46,20 @@ define void @PR18688() {
 ;
 
 entry:
-  br i1 undef, label %return, label %outer.preheader
+  br i1 %arg, label %return, label %outer.preheader
 
 outer.preheader:
   br label %outer.header
 
 outer.header:
   store i32 0, ptr @x, align 4
-  br i1 undef, label %outer.latch, label %inner.preheader
+  br i1 %arg, label %outer.latch, label %inner.preheader
 
 inner.preheader:
   br label %inner.header
 
 inner.header:
-  br i1 undef, label %inner.body.rhs, label %inner.latch
+  br i1 %arg, label %inner.body.rhs, label %inner.latch
 
 inner.body.rhs:
   store i32 0, ptr @x, align 4
@@ -74,7 +74,7 @@ inner.exit:
   br label %outer.latch
 
 outer.latch:
-  br i1 undef, label %outer.exit, label %outer.header
+  br i1 %arg, label %outer.exit, label %outer.header
 
 outer.exit:
   br label %return
diff --git a/llvm/test/Transforms/LICM/loopsink-pr39570.ll b/llvm/test/Transforms/LICM/loopsink-pr39570.ll
index 2894abdf71eee..d0e8569af71ce 100644
--- a/llvm/test/Transforms/LICM/loopsink-pr39570.ll
+++ b/llvm/test/Transforms/LICM/loopsink-pr39570.ll
@@ -39,8 +39,8 @@
 %32 = type { ptr }
 %33 = type <{ %8, ptr, %10, i32, ptr, ptr, ptr, %27, %28, i16, [2 x i8] }>
 
-define dso_local void @pr39570() local_unnamed_addr align 2 personality ptr @__gxx_personality_v0 !prof !1 {
-  br i1 undef, label %8, label %1, !prof !2
+define dso_local void @pr39570(i1 %arg) local_unnamed_addr align 2 personality ptr @__gxx_personality_v0 !prof !1 {
+  br i1 %arg, label %8, label %1, !prof !2
 
 ; <label>:1:                                      ; preds = %0
   %2 = load ptr, ptr undef, align 4
@@ -52,7 +52,7 @@ define dso_local void @pr39570() local_unnamed_addr align 2 personality ptr @__g
 
 ; <label>:5:                                      ; preds = %3
   %6 = getelementptr inbounds %0, ptr %2, i32 undef, i32 4
-  br i1 undef, label %18, label %7, !prof !3
+  br i1 %arg, label %18, label %7, !prof !3
 
 ; <label>:7:                                      ; preds = %5
   br label %3
diff --git a/llvm/test/Transforms/LICM/outer-loop-deleted-before-licm.ll b/llvm/test/Transforms/LICM/outer-loop-deleted-before-licm.ll
index 71bdd450b22d7..64db812016d92 100644
--- a/llvm/test/Transforms/LICM/outer-loop-deleted-before-licm.ll
+++ b/llvm/test/Transforms/LICM/outer-loop-deleted-before-licm.ll
@@ -39,7 +39,7 @@ for.body467.for.body467_crit_edge:                ; preds = %for.body467
   br i1 false, label %for.end539, label %for.body467
 
 for.end539:                                       ; preds = %for.body467
-  br i1 undef, label %for.body43, label %for.end547
+  br i1 false, label %for.body43, label %for.end547
 
 for.end547:                                       ; preds = %for.body43
   ret void
diff --git a/llvm/test/Transforms/LICM/pr32129.ll b/llvm/test/Transforms/LICM/pr32129.ll
index 79025556eb51f..9a0444a2dd1c1 100644
--- a/llvm/test/Transforms/LICM/pr32129.ll
+++ b/llvm/test/Transforms/LICM/pr32129.ll
@@ -3,18 +3,23 @@
 
 declare void @llvm.experimental.guard(i1, ...)
 
-define void @test() {
-; CHECK-LABEL: define void @test() {
+define void @test(i1 %arg) {
+; CHECK-LABEL: define void @test
+; CHECK-SAME: (i1 [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 0, 400
 ; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[TMP0]], i32 9) [ "deopt"() ]
+; CHECK-NEXT:    br i1 [[ARG]], label [[ENTRY_SPLIT:%.*]], label [[HEADER_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[HEADER:%.*]]
 ; CHECK:       header.loopexit:
 ; CHECK-NEXT:    br label [[HEADER]]
 ; CHECK:       header:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       header.split:
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    br i1 true, label [[HEADER_LOOPEXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br label [[LOOP1]]
 ;
 entry:
   br label %header
@@ -25,5 +30,5 @@ header:
 loop:
   %0 = icmp ult i32 0, 400
   call void (i1, ...) @llvm.experimental.guard(i1 %0, i32 9) [ "deopt"() ]
-  br i1 undef, label %header, label %loop
+  br i1 %arg, label %header, label %loop
 }
diff --git a/llvm/test/Transforms/LICM/pr37323.ll b/llvm/test/Transforms/LICM/pr37323.ll
index 7e50a52b50d41..4d28065affa80 100644
--- a/llvm/test/Transforms/LICM/pr37323.ll
+++ b/llvm/test/Transforms/LICM/pr37323.ll
@@ -7,23 +7,23 @@ target triple = "x86_64-unknown-linux-gnu"
 ;Make sure this test do not crash while accessing PostDomTree which is not
 ;preserved in LICM.
 ;
-;CHECK-LABEL: fn1()
+;CHECK-LABEL: fn1(i1 %arg)
 ;CHECK-LABEL: for.cond.loopexit.split.loop.exit
 ;CHECK-LABEL: for.cond.loopexit.split.loop.exit1
-define void @fn1() {
+define void @fn1(i1 %arg) {
 entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %if.end, %for.cond1, %entry
   %0 = phi i16 [ undef, %entry ], [ ptrtoint (ptr @c to i16), %if.end ], [ %.mux, %for.cond1 ]
-  br i1 undef, label %for.cond1, label %for.end8
+  br i1 %arg, label %for.cond1, label %for.end8
 
 for.cond1:                                        ; preds = %if.end, %for.cond
   %.mux = select i1 undef, i16 undef, i16 ptrtoint (ptr @c to i16)
-  br i1 undef, label %for.cond, label %if.end
+  br i1 %arg, label %for.cond, label %if.end
 
 if.end:                                           ; preds = %for.cond1
-  br i1 undef, label %for.cond, label %for.cond1
+  br i1 %arg, label %for.cond, label %for.cond1
 
 for.end8:                                         ; preds = %for.cond
   ret void
diff --git a/llvm/test/Transforms/LICM/pr38513.ll b/llvm/test/Transforms/LICM/pr38513.ll
index a08194cd4145a..42f57133e2362 100644
--- a/llvm/test/Transforms/LICM/pr38513.ll
+++ b/llvm/test/Transforms/LICM/pr38513.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -disable-basic-aa -alias-set-saturation-threshold=2 -passes='loop-mssa(licm)' -S < %s | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK-LABEL: @f1()
-define void @f1() {
+; CHECK-LABEL: @f1(i1 %arg)
+define void @f1(i1 %arg) {
   %lc1.10 = alloca [3 x i16]
   br label %bb1
 
@@ -19,7 +19,7 @@ bb2:                                              ; preds = %bb8, %bb1
   br label %bb8
 
 bb8:                                              ; preds = %bb2
-  br i1 undef, label %bb2, label %bb6
+  br i1 %arg, label %bb2, label %bb6
 
 bb6:                                              ; preds = %bb8
   br label %bb1
diff --git a/llvm/test/Transforms/LICM/pr50367.ll b/llvm/test/Transforms/LICM/pr50367.ll
index 3f2b1a5303874..a7cf21deff627 100644
--- a/llvm/test/Transforms/LICM/pr50367.ll
+++ b/llvm/test/Transforms/LICM/pr50367.ll
@@ -2,14 +2,14 @@
 ; RUN: opt -S -passes='loop-mssa(licm)' < %s | FileCheck %s
 @e = external dso_local global ptr, align 8
 
-define void @main() {
+define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop1:
 ; CHECK-NEXT:    br label [[LOOP2:%.*]]
 ; CHECK:       loop2:
-; CHECK-NEXT:    br i1 false, label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]]
 ; CHECK:       loop2.latch:
 ; CHECK-NEXT:    br label [[LOOP2]]
 ; CHECK:       loop.latch:
@@ -22,7 +22,7 @@ loop1:
   br label %loop2
 
 loop2:
-  br i1 undef, label %loop2.latch, label %loop.latch
+  br i1 %arg, label %loop2.latch, label %loop.latch
 
 loop2.latch:
   store i32 0, ptr null, align 4
diff --git a/llvm/test/Transforms/LICM/sink-promote.ll b/llvm/test/Transforms/LICM/sink-promote.ll
index 8a90f0b40340d..418b40c22b49c 100644
--- a/llvm/test/Transforms/LICM/sink-promote.ll
+++ b/llvm/test/Transforms/LICM/sink-promote.ll
@@ -4,7 +4,7 @@
 ; a memory location in a loop.
 ; Store can be sunk out of exit block containing indirectbr instructions after
 ; D50925. Updated to use an argument instead of undef, due to PR38989.
-define void @test12(ptr %ptr) {
+define void @test12(ptr %ptr, ptr %arg) {
 ; CHECK-LABEL: @test12
 ; CHECK: store
 ; CHECK-NEXT: br label %lab4
@@ -20,10 +20,10 @@ lab6:
   br label %lab4
 
 lab7:
-  br i1 undef, label %lab8, label %lab13
+  br i1 false, label %lab8, label %lab13
 
 lab8:
-  br i1 undef, label %lab13, label %lab10
+  br i1 true, label %lab13, label %lab10
 
 lab10:
   br label %lab7
@@ -39,12 +39,12 @@ lab21:
 ; CHECK-NOT: store
 ; CHECK: br i1 false, label %lab21, label %lab22
   store i32 36127957, ptr %ptr, align 4
-  br i1 undef, label %lab21, label %lab22
+  br i1 false, label %lab21, label %lab22
 
 lab22:
 ; CHECK: lab22:
 ; CHECK-NOT: store
-; CHECK-NEXT: indirectbr ptr undef
-  indirectbr ptr undef, [label %lab5, label %lab6, label %lab7]
+; CHECK-NEXT: indirectbr ptr %arg
+  indirectbr ptr %arg, [label %lab5, label %lab6, label %lab7]
 }
 
diff --git a/llvm/test/Transforms/LICM/sinking.ll b/llvm/test/Transforms/LICM/sinking.ll
index 5888f2dcf65b0..e7ac07b50625a 100644
--- a/llvm/test/Transforms/LICM/sinking.ll
+++ b/llvm/test/Transforms/LICM/sinking.ll
@@ -482,15 +482,15 @@ exit:
 ; Test that we don't crash when trying to sink stores and there's no preheader
 ; available (which is used for creating loads that may be used by the SSA
 ; updater)
-define void @test13() {
+define void @test13(i1 %arg) {
 ; CHECK-LABEL: @test13(
 ; CHECK-NEXT:    br label [[LAB59:%.*]]
 ; CHECK:       lab19:
-; CHECK-NEXT:    br i1 false, label [[LAB20:%.*]], label [[LAB38_LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[LAB20:%.*]], label [[LAB38_LOOPEXIT:%.*]]
 ; CHECK:       lab20:
 ; CHECK-NEXT:    br label [[LAB60:%.*]]
 ; CHECK:       lab21:
-; CHECK-NEXT:    br i1 undef, label [[LAB22:%.*]], label [[LAB38:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[LAB22:%.*]], label [[LAB38:%.*]]
 ; CHECK:       lab22:
 ; CHECK-NEXT:    br label [[LAB38]]
 ; CHECK:       lab38.loopexit:
@@ -506,13 +506,13 @@ define void @test13() {
   br label %lab59
 
 lab19:
-  br i1 undef, label %lab20, label %lab38
+  br i1 %arg, label %lab20, label %lab38
 
 lab20:
   br label %lab60
 
 lab21:
-  br i1 undef, label %lab22, label %lab38
+  br i1 %arg, label %lab22, label %lab38
 
 lab22:
   br label %lab38
@@ -886,16 +886,16 @@ try.cont:
 ; The sinkable call should be sunk into an exit block split. After splitting
 ; the exit block, BlockColor for new blocks should be added properly so
 ; that we should be able to access valid ColorVector.
-define i32 @test21_pr36184(ptr %P) personality ptr @__CxxFrameHandler3 {
+define i32 @test21_pr36184(ptr %P, i1 %arg) personality ptr @__CxxFrameHandler3 {
 ; CHECK-LABEL: @test21_pr36184(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_PH:%.*]]
 ; CHECK:       loop.ph:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       Loop:
-; CHECK-NEXT:    br i1 false, label [[CONTLOOP:%.*]], label [[OUT_SPLIT_LOOP_EXIT1:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[CONTLOOP:%.*]], label [[OUT_SPLIT_LOOP_EXIT1:%.*]]
 ; CHECK:       ContLoop:
-; CHECK-NEXT:    br i1 false, label [[LOOP]], label [[OUT_SPLIT_LOOP_EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[LOOP]], label [[OUT_SPLIT_LOOP_EXIT:%.*]]
 ; CHECK:       Out.split.loop.exit:
 ; CHECK-NEXT:    [[IDX_PH:%.*]] = phi i32 [ 0, [[CONTLOOP]] ]
 ; CHECK-NEXT:    br label [[OUT:%.*]]
@@ -914,10 +914,10 @@ loop.ph:
 
 Loop:
   %sinkableCall = call i32 @strlen( ptr %P ) readonly
-  br i1 undef, label %ContLoop, label %Out
+  br i1 %arg, label %ContLoop, label %Out
 
 ContLoop:
-  br i1 undef, label %Loop, label %Out
+  br i1 %arg, label %Loop, label %Out
 
 Out:
   %idx = phi i32 [ %sinkableCall, %Loop ], [0, %ContLoop ]
diff --git a/llvm/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll b/llvm/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
index 7518f5dee4d0b..48d87acdba5bc 100644
--- a/llvm/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
+++ b/llvm/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
@@ -13,9 +13,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 %"class.llvm::SmallVectorTemplateCommon" = type { %"class.llvm::SmallVectorBase" }
 %"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
 
-define void @_ZNK5clang6driver7ArgList20AddAllArgsTranslatedERN4llvm11SmallVectorIPKcLj16EEENS0_12OptSpecifierES5_b(i1 zeroext %Joined) nounwind align 2 {
+define void @_ZNK5clang6driver7ArgList20AddAllArgsTranslatedERN4llvm11SmallVectorIPKcLj16EEENS0_12OptSpecifierES5_b(i1 zeroext %Joined, i1 %arg) nounwind align 2 {
 entry:
-  br i1 undef, label %entry.split.us, label %entry.entry.split_crit_edge
+  br i1 %arg, label %entry.split.us, label %entry.entry.split_crit_edge
 
 entry.entry.split_crit_edge:                      ; preds = %entry
   br label %entry.split
@@ -39,10 +39,10 @@ entry.split:                                      ; preds = %entry.entry.split_c
   br label %for.cond.i14
 
 for.cond.i14:                                     ; preds = %for.inc.i38, %entry.split
-  br i1 undef, label %for.cond.i50.us-lcssa, label %if.end.i23
+  br i1 %arg, label %for.cond.i50.us-lcssa, label %if.end.i23
 
 if.end.i23:                                       ; preds = %for.cond.i14
-  br i1 undef, label %for.cond.i50.us-lcssa, label %for.inc.i38
+  br i1 %arg, label %for.cond.i50.us-lcssa, label %for.inc.i38
 
 for.inc.i38:                                      ; preds = %if.end.i23
   br label %for.cond.i14
@@ -60,52 +60,52 @@ for.cond.loopexit:                                ; preds = %for.cond.loopexit.u
   br label %for.cond
 
 for.cond:                                         ; preds = %for.cond.loopexit, %for.cond.i50
-  br i1 undef, label %for.end, label %for.body
+  br i1 %arg, label %for.end, label %for.body
 
 for.body:                                         ; preds = %for.cond
   br i1 %Joined, label %if.then, label %if.else
 
 if.then:                                          ; preds = %for.body
-  br i1 undef, label %cond.false.i.i, label %_ZN4llvm9StringRefC1EPKc.exit
+  br i1 %arg, label %cond.false.i.i, label %_ZN4llvm9StringRefC1EPKc.exit
 
 cond.false.i.i:                                   ; preds = %if.then
   unreachable
 
 _ZN4llvm9StringRefC1EPKc.exit:                    ; preds = %if.then
-  br i1 undef, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit, label %cond.false.i.i91
+  br i1 %arg, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit, label %cond.false.i.i91
 
 cond.false.i.i91:                                 ; preds = %_ZN4llvm9StringRefC1EPKc.exit
   unreachable
 
 _ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit: ; preds = %_ZN4llvm9StringRefC1EPKc.exit
-  br i1 undef, label %cond.false.i.i.i, label %if.end13.i.i.i.i
+  br i1 %arg, label %cond.false.i.i.i, label %if.end13.i.i.i.i
 
 if.end13.i.i.i.i:                                 ; preds = %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit
-  br i1 undef, label %land.lhs.true16.i.i.i.i, label %if.end19.i.i.i.i
+  br i1 %arg, label %land.lhs.true16.i.i.i.i, label %if.end19.i.i.i.i
 
 land.lhs.true16.i.i.i.i:                          ; preds = %if.end13.i.i.i.i
-  br i1 undef, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i
+  br i1 %arg, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i
 
 _ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i:         ; preds = %land.lhs.true16.i.i.i.i
-  br i1 undef, label %cond.false.i.i.i, label %if.end19.i.i.i.i
+  br i1 %arg, label %cond.false.i.i.i, label %if.end19.i.i.i.i
 
 if.end19.i.i.i.i:                                 ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i, %if.end13.i.i.i.i
-  br i1 undef, label %land.lhs.true22.i.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
+  br i1 %arg, label %land.lhs.true22.i.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
 
 land.lhs.true22.i.i.i.i:                          ; preds = %if.end19.i.i.i.i
-  br i1 undef, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i
+  br i1 %arg, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i
 
 _ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i:          ; preds = %land.lhs.true22.i.i.i.i
-  br i1 undef, label %cond.false.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
+  br i1 %arg, label %cond.false.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
 
 cond.false.i.i.i:                                 ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i, %land.lhs.true22.i.i.i.i, %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i, %land.lhs.true16.i.i.i.i, %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit
   unreachable
 
 _ZN4llvmplERKNS_9StringRefEPKc.exit:              ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i, %if.end19.i.i.i.i
-  br i1 undef, label %Retry.i, label %if.end.i99
+  br i1 %arg, label %Retry.i, label %if.end.i99
 
 Retry.i:                                          ; preds = %if.end.i99, %_ZN4llvmplERKNS_9StringRefEPKc.exit
-  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit, label %new.notnull.i
+  br i1 %arg, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit, label %new.notnull.i
 
 new.notnull.i:                                    ; preds = %Retry.i
   br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit
@@ -117,10 +117,10 @@ _ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit: ; preds = %new.notnull.i, %
   br label %for.cond.i.preheader
 
 if.else:                                          ; preds = %for.body
-  br i1 undef, label %Retry.i108, label %if.end.i113
+  br i1 %arg, label %Retry.i108, label %if.end.i113
 
 Retry.i108:                                       ; preds = %if.end.i113, %if.else
-  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114, label %new.notnull.i110
+  br i1 %arg, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114, label %new.notnull.i110
 
 new.notnull.i110:                                 ; preds = %Retry.i108
   br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
@@ -129,16 +129,16 @@ if.end.i113:                                      ; preds = %if.else
   br label %Retry.i108
 
 _ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114: ; preds = %new.notnull.i110, %Retry.i108
-  br i1 undef, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125, label %cond.false.i.i123
+  br i1 %arg, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125, label %cond.false.i.i123
 
 cond.false.i.i123:                                ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
   unreachable
 
 _ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125: ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
-  br i1 undef, label %Retry.i134, label %if.end.i139
+  br i1 %arg, label %Retry.i134, label %if.end.i139
 
 Retry.i134:                                       ; preds = %if.end.i139, %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125
-  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140, label %new.notnull.i136
+  br i1 %arg, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140, label %new.notnull.i136
 
 new.notnull.i136:                                 ; preds = %Retry.i134
   br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140
@@ -150,7 +150,7 @@ _ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140: ; preds = %new.notnull.i
   br label %for.cond.i.preheader
 
 for.cond.i.preheader:                             ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140, %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit
-  br i1 undef, label %for.cond.i.preheader.split.us, label %for.cond.i.preheader.for.cond.i.preheader.split_crit_edge
+  br i1 %arg, label %for.cond.i.preheader.split.us, label %for.cond.i.preheader.for.cond.i.preheader.split_crit_edge
 
 for.cond.i.preheader.for.cond.i.preheader.split_crit_edge: ; preds = %for.cond.i.preheader
   br label %for.cond.i.preheader.split
@@ -172,10 +172,10 @@ for.cond.i.preheader.split:                       ; preds = %for.cond.i.preheade
   br label %for.cond.i
 
 for.cond.i:                                       ; preds = %if.end.i, %for.cond.i.preheader.split
-  br i1 undef, label %for.cond.loopexit.us-lcssa, label %if.end.i
+  br i1 %arg, label %for.cond.loopexit.us-lcssa, label %if.end.i
 
 if.end.i:                                         ; preds = %for.cond.i
-  br i1 undef, label %for.cond.loopexit.us-lcssa, label %for.cond.i
+  br i1 %arg, label %for.cond.loopexit.us-lcssa, label %for.cond.i
 
 for.end:                                          ; preds = %for.cond
   ret void
diff --git a/llvm/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll b/llvm/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll
index 6c54a2a2ce17a..e944a8fe15fc4 100644
--- a/llvm/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll
+++ b/llvm/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll
@@ -17,19 +17,19 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %entry
-  br i1 undef, label %lbl63A679E5, label %for.body
+  br i1 true, label %lbl63A679E5, label %for.body
 
 for.body:                                         ; preds = %for.cond
   br label %for.cond1
 
 for.cond1:                                        ; preds = %for.cond1, %for.body
-  br i1 undef, label %for.cond1, label %for.cond3.loopexit
+  br i1 false, label %for.cond1, label %for.cond3.loopexit
 
 for.cond3.loopexit:                               ; preds = %for.cond1
   br label %for.cond3
 
 for.cond3:                                        ; preds = %for.cond9, %for.cond3.loopexit
-  br i1 undef, label %for.body4, label %for.cond17
+  br i1 false, label %for.body4, label %for.cond17
 
 for.body4:                                        ; preds = %for.cond3
   br label %for.cond5
@@ -41,7 +41,7 @@ lbl63A679E5:                                      ; preds = %for.cond
   br label %for.cond5
 
 for.cond9:                                        ; preds = %for.end14.split, %for.cond5
-  br i1 undef, label %for.cond3, label %lbl64774A9B
+  br i1 false, label %for.cond3, label %lbl64774A9B
 
 lbl64774A9B:                                      ; preds = %for.cond17, %for.cond9
   br label %for.end14.split
diff --git a/llvm/test/Transforms/LoopDeletion/bbi-59728.ll b/llvm/test/Transforms/LoopDeletion/bbi-59728.ll
index bf0ba372e5e3b..8f2ea7d09af12 100644
--- a/llvm/test/Transforms/LoopDeletion/bbi-59728.ll
+++ b/llvm/test/Transforms/LoopDeletion/bbi-59728.ll
@@ -20,7 +20,7 @@ for.body45:                                       ; preds = %for.end72, %entry
 
 for.body48:                                       ; preds = %for.body48, %for.body45
   store i32 433429641, ptr undef, align 1
-  br i1 undef, label %for.body48, label %for.end72
+  br i1 false, label %for.body48, label %for.end72
 
 for.end72:                                        ; preds = %for.body48
   br label %for.body45
diff --git a/llvm/test/Transforms/LoopDeletion/crashbc.ll b/llvm/test/Transforms/LoopDeletion/crashbc.ll
index c01453bbda817..c230dcf62d2ba 100644
--- a/llvm/test/Transforms/LoopDeletion/crashbc.ll
+++ b/llvm/test/Transforms/LoopDeletion/crashbc.ll
@@ -2,12 +2,12 @@
 ; RUN: opt < %s -passes=loop-deletion -o /dev/null
 ; RUN: opt < %s -passes=loop-deletion -o /dev/null --try-experimental-debuginfo-iterators
 
-define void @f() {
+define void @f(i1 %arg) {
   br label %bb1
 
 bb1:                                              ; preds = %bb1, %0
   call void @llvm.dbg.value(metadata i16 undef, metadata !1, metadata !DIExpression()), !dbg !11
-  br i1 undef, label %bb1, label %bb3
+  br i1 %arg, label %bb1, label %bb3
 
 bb3:                                              ; preds = %bb1
   ret void
diff --git a/llvm/test/Transforms/LoopDeletion/pr53969.ll b/llvm/test/Transforms/LoopDeletion/pr53969.ll
index 3b8904e4457f8..45c7ee2aecbbe 100644
--- a/llvm/test/Transforms/LoopDeletion/pr53969.ll
+++ b/llvm/test/Transforms/LoopDeletion/pr53969.ll
@@ -66,7 +66,7 @@ bb1:                                              ; preds = %bb31, %bb
   br i1 %tmp10, label %bb33, label %bb34
 
 bb11:                                             ; preds = %bb34
-  br i1 undef, label %bb33, label %bb34
+  br i1 true, label %bb33, label %bb34
 
 bb12:                                             ; preds = %bb34
   %tmp13 = icmp eq ptr addrspace(1) undef, null
@@ -113,7 +113,7 @@ bb34:                                             ; preds = %bb11, %bb1
   %tmp39 = add i32 %tmp38, undef
   %tmp40 = sext i32 %tmp39 to i64
   %tmp41 = add i64 undef, %tmp40
-  br i1 undef, label %bb11, label %bb12
+  br i1 false, label %bb11, label %bb12
 
 bb42:                                             ; preds = %bb22
   store atomic i64 %tmp18, ptr addrspace(1) undef unordered, align 8
diff --git a/llvm/test/Transforms/LoopDeletion/simplify-then-delete.ll b/llvm/test/Transforms/LoopDeletion/simplify-then-delete.ll
index 664578ef9fe08..529ee8919bdb3 100644
--- a/llvm/test/Transforms/LoopDeletion/simplify-then-delete.ll
+++ b/llvm/test/Transforms/LoopDeletion/simplify-then-delete.ll
@@ -7,7 +7,7 @@
 
 target datalayout = "e-p:64:64:64"
 
-define i32 @pmat(i32 %m, i32 %n, ptr %y) nounwind {
+define i32 @pmat(i32 %m, i32 %n, ptr %y, i1 %arg) nounwind {
 ; CHECK-LABEL: @pmat(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -45,7 +45,7 @@ w.c2.p:
   br i1 false, label %bb.n, label %w.e
 
 w.c2:
-  br i1 undef, label %w.b6, label %w.c2.w.ec
+  br i1 %arg, label %w.b6, label %w.c2.w.ec
 
 w.c2.w.ec:
   br label %w.e
diff --git a/llvm/test/Transforms/LoopIdiom/non-canonical-loop.ll b/llvm/test/Transforms/LoopIdiom/non-canonical-loop.ll
index aeebdfa33907a..eb93b6fbdab1a 100644
--- a/llvm/test/Transforms/LoopIdiom/non-canonical-loop.ll
+++ b/llvm/test/Transforms/LoopIdiom/non-canonical-loop.ll
@@ -5,15 +5,15 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define void @test(ptr %currMB) nounwind uwtable {
+define void @test(ptr %currMB, i1 %arg, ptr %arg2) nounwind uwtable {
 entry:
-  br i1 undef, label %start.exit, label %if.then.i
+  br i1 %arg, label %start.exit, label %if.then.i
 
 if.then.i:                                        ; preds = %entry
   unreachable
 
 start.exit:                       ; preds = %entry
-  indirectbr ptr undef, [label %0, label %for.bodyprime]
+  indirectbr ptr %arg2, [label %0, label %for.bodyprime]
 
 ; <label>:0                                       ; preds = %start.exit
   unreachable
diff --git a/llvm/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll b/llvm/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll
index f9eeb55aecb7d..0bf047d3b8759 100644
--- a/llvm/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll
+++ b/llvm/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll
@@ -2,9 +2,9 @@
 
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK-LABEL: @f1()
+; CHECK-LABEL: @f1(i1 %arg)
 ; CHECK-NEXT: entry:
-define void @f1() {
+define void @f1(i1 %arg) {
 entry:
   br label %lbl1
 
@@ -15,10 +15,10 @@ for:                                              ; preds = %if.end, %lbl1
   br label %lor.end
 
 lor.end:                                          ; preds = %for
-  br i1 undef, label %for.end, label %if.end
+  br i1 %arg, label %for.end, label %if.end
 
 if.end:                                           ; preds = %lor.end
-  br i1 undef, label %lbl1, label %for
+  br i1 %arg, label %lbl1, label %for
 
 for.end:                                          ; preds = %lor.end
   ret void
diff --git a/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll
new file mode 100644
index 0000000000000..788e1b0157d80
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+
+; RUN: opt < %s -passes=loop-interchange -debug -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+
+@N = dso_local global i32 0, align 4
+@a = dso_local global ptr null, align 8
+@b = dso_local global ptr null, align 8
+@c = dso_local global ptr null, align 8
+
+; Loop interchange should not run delinearization
+; for the single-loop case and should bail out early.
+
+; CHECK-NOT: Delinearizing
+; CHECK-NOT: Strides:
+; CHECK-NOT: Terms:
+; CHECK: Loop doesn't contain minimum nesting level.
+
+define void @foo() {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr @N, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr @b, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw i32, ptr %2, i64 %idxprom
+  %4 = load i32, ptr %arrayidx, align 4
+  %5 = load ptr, ptr @c, align 8
+  %6 = load i32, ptr %i, align 4
+  %idxprom1 = zext i32 %6 to i64
+  %arrayidx2 = getelementptr inbounds nuw i32, ptr %5, i64 %idxprom1
+  %7 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %4, %7
+  %8 = load ptr, ptr @a, align 8
+  %9 = load i32, ptr %i, align 4
+  %idxprom3 = zext i32 %9 to i64
+  %arrayidx4 = getelementptr inbounds nuw i32, ptr %8, i64 %idxprom3
+  store i32 %add, ptr %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %10 = load i32, ptr %i, align 4
+  %inc = add i32 %10, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopInterchange/call-instructions-remarks.ll b/llvm/test/Transforms/LoopInterchange/call-instructions-remarks.ll
new file mode 100644
index 0000000000000..4ed3e7e0f5d00
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/call-instructions-remarks.ll
@@ -0,0 +1,63 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info 2>&1
+; RUN: FileCheck --input-file=%t %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+
+declare void @foo(i64 %a)
+declare void @bar(i64 %a) readnone
+
+;;--------------------------------------Test case 01------------------------------------
+;; Not safe to interchange, because the called function `foo` is not marked as
+;; readnone, so it could introduce dependences.
+;;
+;;  for(int i=0;i<100;i++) {
+;;    for(int j=1;j<100;j++) {
+;;      foo(i);
+;;      A[j][i] = A[j][i]+k;
+;;    }
+;;  }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            CallInst
+; CHECK-NEXT: Function:        interchange_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String:          Cannot interchange loops due to call instruction.
+
+define void @interchange_01(i32 %k) {
+entry:
+  br label %for1.header
+
+for1.header:
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc10 ]
+  br label %for2
+
+for2:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for2 ], [ 1, %for1.header ]
+  call void @foo(i64 %indvars.iv23)
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i32, ptr %arrayidx5
+  %add = add nsw i32 %lv, %k
+  store i32 %add, ptr %arrayidx5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 99
+  br i1 %exitcond, label %for2.loopexit , label %for2
+
+for2.loopexit:
+  br label %for1.inc10
+
+for1.inc10:
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exitcond26 = icmp eq i64 %indvars.iv23, 99
+  br i1 %exitcond26, label %for1.loopexit, label %for1.header
+
+for1.loopexit:
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/call-instructions.ll b/llvm/test/Transforms/LoopInterchange/call-instructions.ll
index 49e877aa0d36e..b207166302d21 100644
--- a/llvm/test/Transforms/LoopInterchange/call-instructions.ll
+++ b/llvm/test/Transforms/LoopInterchange/call-instructions.ll
@@ -1,7 +1,5 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
-; RUN:     -verify-dom-info -verify-loop-info -stats 2>&1 | FileCheck -check-prefix=STATS %s
-; RUN: FileCheck --input-file=%t %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -S -verify-dom-info -verify-loop-info 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -10,58 +8,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 declare void @foo(i64 %a)
 declare void @bar(i64 %a) readnone
 
-;;--------------------------------------Test case 01------------------------------------
-;; Not safe to interchange, because the called function `foo` is not marked as
-;; readnone, so it could introduce dependences.
-;;
-;;  for(int i=0;i<100;i++) {
-;;    for(int j=1;j<100;j++) {
-;;      foo(i);
-;;      A[j][i] = A[j][i]+k;
-;;    }
-;; }
-
-; CHECK: --- !Missed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            CallInst
-; CHECK-NEXT: Function:        interchange_01
-; CHECK-NEXT: Args:
-; CHECK-NEXT: - String:          Cannot interchange loops due to call instruction.
-
-define void @interchange_01(i32 %k) {
-entry:
-  br label %for1.header
-
-for1.header:
-  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc10 ]
-  br label %for2
-
-for2:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for2 ], [ 1, %for1.header ]
-  call void @foo(i64 %indvars.iv23)
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
-  %lv = load i32, ptr %arrayidx5
-  %add = add nsw i32 %lv, %k
-  store i32 %add, ptr %arrayidx5
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv, 99
-  br i1 %exitcond, label %for2.loopexit , label %for2
-
-for2.loopexit:
-  br label %for1.inc10
-
-for1.inc10:
-  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
-  %exitcond26 = icmp eq i64 %indvars.iv23, 99
-  br i1 %exitcond26, label %for1.loopexit, label %for1.header
-
-for1.loopexit:
-  br label %exit
-
-exit:
-  ret void
-}
-
 ;;--------------------------------------Test case 02------------------------------------
 ;; Safe to interchange, because the called function `bar` is marked as readnone,
 ;; so it cannot introduce dependences.
@@ -72,16 +18,46 @@ exit:
 ;;      A[j][i] = A[j][i]+k;
 ;;    }
 ;; }
-
-; CHECK: --- !Passed
-; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
-; CHECK-NEXT: Function:        interchange_02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
-; CHECK-NEXT: ...
-
+;
 define void @interchange_02(i32 %k) {
+; CHECK-LABEL: define void @interchange_02(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR2_PREHEADER:.*]]
+; CHECK:       [[FOR1_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR1_HEADER:.*]]
+; CHECK:       [[FOR1_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], %[[FOR1_INC10:.*]] ], [ 0, %[[FOR1_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR2_SPLIT1:.*]]
+; CHECK:       [[FOR2_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR2:.*]]
+; CHECK:       [[FOR2]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FOR2_SPLIT:.*]] ], [ 1, %[[FOR2_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR1_HEADER_PREHEADER]]
+; CHECK:       [[FOR2_SPLIT1]]:
+; CHECK-NEXT:    call void @bar(i64 [[INDVARS_IV23]])
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[LV]], [[K]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 99
+; CHECK-NEXT:    br label %[[FOR2_LOOPEXIT:.*]]
+; CHECK:       [[FOR2_SPLIT]]:
+; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDVARS_IV]], 99
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR1_LOOPEXIT:.*]], label %[[FOR2]]
+; CHECK:       [[FOR2_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR1_INC10]]
+; CHECK:       [[FOR1_INC10]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i64 [[INDVARS_IV23]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label %[[FOR2_SPLIT]], label %[[FOR1_HEADER]]
+; CHECK:       [[FOR1_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for1.header
 
@@ -114,6 +90,3 @@ for1.loopexit:
 exit:
   ret void
 }
-
-; Check stats, we interchanged 1 out of 2 loops.
-; STATS: 1 loop-interchange - Number of loops interchanged
diff --git a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar-remark.ll b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar-remark.ll
new file mode 100644
index 0000000000000..51fa6469cdbbb
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar-remark.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S -debug 2>&1 | FileCheck %s
+
+@A = common global [100 x [100 x i64]] zeroinitializer
+@N = dso_local local_unnamed_addr global i64 100, align 8
+
+;  for(int i=0;i<100;i++)
+;    for(int j=0;j<i;j++)
+;      A[j][i] = A[j][i]+k;
+;
+; Inner loop induction variable exit condition depends on the
+; outer loop induction variable, i.e., triangular loops.
+;
+; CHECK: Loop structure not understood by pass
+; CHECK: Not interchanging loops. Cannot prove legality.
+;
+define void @interchange_01(i64 %k) {  ; two-deep nest whose inner exit test reads the outer induction variable
+entry:
+  br label %for1.header
+
+for1.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]  ; outer induction variable, starts at 0
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]  ; inner induction variable, restarts at 0 per outer iteration
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i  ; address of A[j][i]
+  %lv = load i64, ptr %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, ptr %arrayidx5  ; A[j][i] = A[j][i] + k
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %j, %i  ; inner exit compares j against the outer variable i
+  br i1 %exitcond, label %for1.inc10, label %for2
+
+for1.inc10:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond26 = icmp eq i64 %i, 99  ; outer loop exits once i reaches 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}
+
+
+;  for(int i=0;i<100;i++)
+;    for(int j=0;j+i<100;j++)
+;      A[j][i] = A[j][i]+k;
+;
+; Inner loop induction variable exit condition depends on the
+; outer loop induction variable, i.e., triangular loops.
+;
+; CHECK: Loop structure not understood by pass
+; CHECK: Not interchanging loops. Cannot prove legality.
+;
+define void @interchange_02(i64 %k) {  ; two-deep nest whose inner exit test reads j + i
+entry:
+  br label %for1.header
+
+for1.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]  ; outer induction variable, starts at 0
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]  ; inner induction variable, restarts at 0 per outer iteration
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i  ; address of A[j][i]
+  %lv = load i64, ptr %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, ptr %arrayidx5  ; A[j][i] = A[j][i] + k
+  %0 = add nuw nsw i64 %j, %i  ; j + i, mixes inner and outer induction variables
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %0, 100  ; inner exit depends on j + i, and so on the outer variable
+  br i1 %exitcond, label %for1.inc10, label %for2
+
+for1.inc10:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond26 = icmp eq i64 %i, 99  ; outer loop exits once i reaches 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}
+
+;  for(int i=0;i<100;i++)
+;    for(int j=0;i>j;j++)
+;      A[j][i] = A[j][i]+k;
+;
+; Inner loop induction variable exit condition depends on the
+; outer loop induction variable, i.e., triangular loops.
+; CHECK: Loop structure not understood by pass
+; CHECK: Not interchanging loops. Cannot prove legality.
+;
+define void @interchange_03(i64 %k) {  ; two-deep nest whose inner continue test reads the outer induction variable
+entry:
+  br label %for1.header
+
+for1.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]  ; outer induction variable, starts at 0
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]  ; inner induction variable, restarts at 0 per outer iteration
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i  ; address of A[j][i]
+  %lv = load i64, ptr %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, ptr %arrayidx5  ; A[j][i] = A[j][i] + k
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp ne i64 %i, %j  ; true means keep looping: operands and branch targets are swapped relative to an "eq" exit test
+  br i1 %exitcond, label %for2, label %for1.inc10
+
+for1.inc10:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond26 = icmp eq i64 %i, 99  ; outer loop exits once i reaches 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
index 0a1d1e5250799..ff88375e31856 100644
--- a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
+++ b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
@@ -1,130 +1,47 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 @A = common global [100 x [100 x i64]] zeroinitializer
 @N = dso_local local_unnamed_addr global i64 100, align 8
 
-
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;j<i;j++)
-;;      A[j][i] = A[j][i]+k;
-
-;; Inner loop induction variable exit condition depends on the
-;; outer loop induction variable, i.e., triangular loops.
-; CHECK: Loop structure not understood by pass
-; CHECK: Not interchanging loops. Cannot prove legality.
-
-define void @interchange_01(i64 %k) {
-entry:
-  br label %for1.header
-
-for1.header:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]
-  br label %for2
-
-for2:
-  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i
-  %lv = load i64, ptr %arrayidx5
-  %add = add nsw i64 %lv, %k
-  store i64 %add, ptr %arrayidx5
-  %j.next = add nuw nsw i64 %j, 1
-  %exitcond = icmp eq i64 %j, %i
-  br i1 %exitcond, label %for1.inc10, label %for2
-
-for1.inc10:
-  %i.next = add nuw nsw i64 %i, 1
-  %exitcond26 = icmp eq i64 %i, 99
-  br i1 %exitcond26, label %for.end12, label %for1.header
-
-for.end12:
-  ret void
-}
-
-
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;j+i<100;j++)
-;;      A[j][i] = A[j][i]+k;
-
-;; Inner loop induction variable exit condition depends on the
-;; outer loop induction variable, i.e., triangular loops.
-; CHECK: Loop structure not understood by pass
-; CHECK: Not interchanging loops. Cannot prove legality.
-
-define void @interchange_02(i64 %k) {
-entry:
-  br label %for1.header
-
-for1.header:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]
-  br label %for2
-
-for2:
-  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i
-  %lv = load i64, ptr %arrayidx5
-  %add = add nsw i64 %lv, %k
-  store i64 %add, ptr %arrayidx5
-  %0 = add nuw nsw i64 %j, %i
-  %j.next = add nuw nsw i64 %j, 1
-  %exitcond = icmp eq i64 %0, 100
-  br i1 %exitcond, label %for1.inc10, label %for2
-
-for1.inc10:
-  %i.next = add nuw nsw i64 %i, 1
-  %exitcond26 = icmp eq i64 %i, 99
-  br i1 %exitcond26, label %for.end12, label %for1.header
-
-for.end12:
-  ret void
-}
-
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;i>j;j++)
-;;      A[j][i] = A[j][i]+k;
-
-;; Inner loop induction variable exit condition depends on the
-;; outer loop induction variable, i.e., triangular loops.
-; CHECK: Loop structure not understood by pass
-; CHECK: Not interchanging loops. Cannot prove legality.
-
-define void @interchange_03(i64 %k) {
-entry:
-  br label %for1.header
-
-for1.header:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for1.inc10 ]
-  br label %for2
-
-for2:
-  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 %j, i64 %i
-  %lv = load i64, ptr %arrayidx5
-  %add = add nsw i64 %lv, %k
-  store i64 %add, ptr %arrayidx5
-  %j.next = add nuw nsw i64 %j, 1
-  %exitcond = icmp ne i64 %i, %j
-  br i1 %exitcond, label %for2, label %for1.inc10
-
-for1.inc10:
-  %i.next = add nuw nsw i64 %i, 1
-  %exitcond26 = icmp eq i64 %i, 99
-  br i1 %exitcond26, label %for.end12, label %for1.header
-
-for.end12:
-  ret void
-}
-
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;N>j;j++)
-;;      A[j][i] = A[j][i]+k;
-
-;; Inner loop induction variable exit condition depends on
-;; an outer loop invariant, can do interchange.
-; CHECK: Loops interchanged
-
+; The inner loop's exit condition depends only on an outer-loop
+; invariant (not on the outer induction variable), so the loops can be interchanged.
+;
 define void @interchange_04(i64 %k) {
+; CHECK-LABEL: define void @interchange_04(
+; CHECK-SAME: i64 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @N, align 4
+; CHECK-NEXT:    br label %[[FOR2_PREHEADER:.*]]
+; CHECK:       [[FOR1_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR1_HEADER:.*]]
+; CHECK:       [[FOR1_HEADER]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR1_INC10:.*]] ], [ 0, %[[FOR1_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR2_SPLIT1:.*]]
+; CHECK:       [[FOR2_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR2:.*]]
+; CHECK:       [[FOR2]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[TMP1:%.*]], %[[FOR2_SPLIT:.*]] ], [ 0, %[[FOR2_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR1_HEADER_PREHEADER]]
+; CHECK:       [[FOR2_SPLIT1]]:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i64]], ptr @A, i64 0, i64 [[J]], i64 [[I]]
+; CHECK-NEXT:    [[LV:%.*]] = load i64, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[LV]], [[K]]
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[J_NEXT:%.*]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[TMP0]], [[J]]
+; CHECK-NEXT:    br label %[[FOR1_INC10]]
+; CHECK:       [[FOR2_SPLIT]]:
+; CHECK-NEXT:    [[TMP1]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[J]]
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[FOR2]], label %[[FOR_END12:.*]]
+; CHECK:       [[FOR1_INC10]]:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i64 [[I]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label %[[FOR2_SPLIT]], label %[[FOR1_HEADER]]
+; CHECK:       [[FOR_END12]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load i64, ptr @N, align 4
   br label %for1.header
diff --git a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
index 11e59c6db9f32..bad84224d445a 100644
--- a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
+++ b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
@@ -1,19 +1,61 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 @a = common global i32 0, align 4
 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4
 
-;; After interchanging the innermost and the middle loop, we should not continue
-;; doing interchange for the (new) middle loop and the outermost loop, because of
-;; values defined in the new innermost loop not available in the exiting block of
-;; the entire loop nest.
-; CHECK: Loops are legal to interchange
-; CHECK: Loops interchanged.
-; CHECK: Found unsupported PHI nodes in inner loop latch.
-; CHECK: Not interchanging loops. Cannot prove legality.
+; After interchanging the innermost and the middle loop, we should not continue
+; doing interchange for the (new) middle loop and the outermost loop, because of
+; values defined in the new innermost loop not available in the exiting block of
+; the entire loop nest.
+;
 define void @innermost_latch_uses_values_in_middle_header() {
+; CHECK-LABEL: define void @innermost_latch_uses_values_in_middle_header() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    [[B:%.*]] = add i32 80, 1
+; CHECK-NEXT:    br label %[[OUTERMOST_HEADER:.*]]
+; CHECK:       [[OUTERMOST_HEADER]]:
+; CHECK-NEXT:    [[INDVAR_OUTERMOST:%.*]] = phi i32 [ 10, %[[ENTRY]] ], [ [[INDVAR_OUTERMOST_NEXT:%.*]], %[[OUTERMOST_LATCH:.*]] ]
+; CHECK-NEXT:    [[TOBOOL71_I:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL71_I]], label %[[INNERMOST_HEADER_PREHEADER:.*]], label %[[OUTERMOST_LATCH]]
+; CHECK:       [[MIDDLE_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[MIDDLE_HEADER:.*]]
+; CHECK:       [[MIDDLE_HEADER]]:
+; CHECK-NEXT:    [[INDVAR_MIDDLE:%.*]] = phi i64 [ [[INDVAR_MIDDLE_NEXT:%.*]], %[[MIDDLE_LATCH:.*]] ], [ 4, %[[MIDDLE_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVAR_MIDDLE_WIDE:%.*]] = zext i32 [[B]] to i64
+; CHECK-NEXT:    br label %[[INNERMOST_BODY:.*]]
+; CHECK:       [[INNERMOST_HEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[INNERMOST_HEADER:.*]]
+; CHECK:       [[INNERMOST_HEADER]]:
+; CHECK-NEXT:    [[INDVAR_INNERMOST:%.*]] = phi i64 [ [[TMP1:%.*]], %[[INNERMOST_LATCH_SPLIT:.*]] ], [ 4, %[[INNERMOST_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[MIDDLE_HEADER_PREHEADER]]
+; CHECK:       [[INNERMOST_BODY]]:
+; CHECK-NEXT:    [[ARRAYIDX9_I:%.*]] = getelementptr inbounds [1 x [6 x i32]], ptr @d, i64 0, i64 [[INDVAR_INNERMOST]], i64 [[INDVAR_MIDDLE]]
+; CHECK-NEXT:    store i32 0, ptr [[ARRAYIDX9_I]], align 4
+; CHECK-NEXT:    br label %[[INNERMOST_LATCH:.*]]
+; CHECK:       [[INNERMOST_LATCH]]:
+; CHECK-NEXT:    [[INDVAR_INNERMOST_NEXT:%.*]] = add nsw i64 [[INDVAR_INNERMOST]], 1
+; CHECK-NEXT:    [[TOBOOL5_I:%.*]] = icmp eq i64 [[INDVAR_INNERMOST_NEXT]], [[INDVAR_MIDDLE_WIDE]]
+; CHECK-NEXT:    br label %[[MIDDLE_LATCH]]
+; CHECK:       [[INNERMOST_LATCH_SPLIT]]:
+; CHECK-NEXT:    [[INDVAR_MIDDLE_WIDE_LCSSA:%.*]] = phi i64 [ [[INDVAR_MIDDLE_WIDE]], %[[MIDDLE_LATCH]] ]
+; CHECK-NEXT:    [[TMP1]] = add nsw i64 [[INDVAR_INNERMOST]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], [[INDVAR_MIDDLE_WIDE_LCSSA]]
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[OUTERMOST_LATCH_LOOPEXIT:.*]], label %[[INNERMOST_HEADER]]
+; CHECK:       [[MIDDLE_LATCH]]:
+; CHECK-NEXT:    [[INDVAR_MIDDLE_NEXT]] = add nsw i64 [[INDVAR_MIDDLE]], -1
+; CHECK-NEXT:    [[TOBOOL2_I:%.*]] = icmp eq i64 [[INDVAR_MIDDLE_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL2_I]], label %[[INNERMOST_LATCH_SPLIT]], label %[[MIDDLE_HEADER]]
+; CHECK:       [[OUTERMOST_LATCH_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[OUTERMOST_LATCH]]
+; CHECK:       [[OUTERMOST_LATCH]]:
+; CHECK-NEXT:    [[INDVAR_OUTERMOST_NEXT]] = add nsw i32 [[INDVAR_OUTERMOST]], -5
+; CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp eq i32 [[INDVAR_OUTERMOST_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_I]], label %[[OUTERMOST_EXIT:.*]], label %[[OUTERMOST_HEADER]]
+; CHECK:       [[OUTERMOST_EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load i32, ptr @a, align 4
   %b = add i32 80, 1
diff --git a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
index 010118370bc01..6daf61a4ec007 100644
--- a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
@@ -1,6 +1,5 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -8,28 +7,70 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @B = common global [100 x i32] zeroinitializer
 @C = common global [100 x [100 x i32]] zeroinitializer
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
-
-;; Test that a flow dependency in outer loop doesn't prevent interchange in
-;; loops i and j.
-;;
-;;  for (int k = 0; k < 100; ++k) {
-;;    T[k] = fn1();
-;;    for (int i = 0; i < 1000; ++i)
-;;      for(int j = 1; j < 1000; ++j)
-;;        Arr[j][i] = Arr[j][i]+k;
-;;    fn2(T[k]);
-;;  }
-
-; CHECK: Processing InnerLoopId = 2 and OuterLoopId = 1
-; CHECK: Loops interchanged.
-
-; CHECK: Processing InnerLoopId = 1 and OuterLoopId = 0
-; CHECK: Not interchanging loops. Cannot prove legality.
-
 @T = internal global [100 x double] zeroinitializer, align 4
 @Arr = internal global [1000 x [1000 x i32]] zeroinitializer, align 4
 
+; Test that a flow dependency in outer loop doesn't prevent interchange in
+; loops i and j.
+;
+;  for (int k = 0; k < 100; ++k) {
+;    T[k] = fn1();
+;    for (int i = 0; i < 1000; ++i)
+;      for(int j = 1; j < 1000; ++j)
+;        Arr[j][i] = Arr[j][i]+k;
+;    fn2(T[k]);
+;  }
+;
+; So, loops InnerLoopId = 2 and OuterLoopId = 1 should be interchanged,
+; but not InnerLoopId = 1 and OuterLoopId = 0.
+;
 define void @interchange_09(i32 %k) {
+; CHECK-LABEL: define void @interchange_09(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV45:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT46:%.*]], %[[FOR_COND_CLEANUP4:.*]] ]
+; CHECK-NEXT:    [[CALL:%.*]] = call double @fn1()
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x double], ptr @T, i64 0, i64 [[INDVARS_IV45]]
+; CHECK-NEXT:    store double [[CALL]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    br label %[[FOR_BODY9_PREHEADER:.*]]
+; CHECK:       [[FOR_COND6_PREHEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND6_PREHEADER:.*]]
+; CHECK:       [[FOR_COND6_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV42:%.*]] = phi i64 [ [[INDVARS_IV_NEXT43:%.*]], %[[FOR_COND_CLEANUP8:.*]] ], [ 0, %[[FOR_COND6_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY9_SPLIT1:.*]]
+; CHECK:       [[FOR_BODY9_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY9:.*]]
+; CHECK:       [[FOR_COND_CLEANUP4]]:
+; CHECK-NEXT:    [[TMP:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    call void @fn2(double [[TMP]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT46]] = add nuw nsw i64 [[INDVARS_IV45]], 1
+; CHECK-NEXT:    [[EXITCOND47:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT46]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND47]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP8]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT43]] = add nuw nsw i64 [[INDVARS_IV42]], 1
+; CHECK-NEXT:    [[EXITCOND44:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT43]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND44]], label %[[FOR_COND6_PREHEADER]], label %[[FOR_BODY9_SPLIT:.*]]
+; CHECK:       [[FOR_BODY9]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0:%.*]], %[[FOR_BODY9_SPLIT]] ], [ 1, %[[FOR_BODY9_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_COND6_PREHEADER_PREHEADER]]
+; CHECK:       [[FOR_BODY9_SPLIT1]]:
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x [1000 x i32]], ptr @Arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV42]]
+; CHECK-NEXT:    [[T1:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = trunc i64 [[INDVARS_IV45]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[T1]], [[T2]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 1000
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP8]]
+; CHECK:       [[FOR_BODY9_SPLIT]]:
+; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 1000
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_BODY9]], label %[[FOR_COND_CLEANUP4]]
+;
 entry:
   br label %for.body
 
@@ -62,9 +103,9 @@ for.cond.cleanup8:                                ; preds = %for.body9
 for.body9:                                        ; preds = %for.body9, %for.cond6.preheader
   %indvars.iv = phi i64 [ 1, %for.cond6.preheader ], [ %indvars.iv.next, %for.body9 ]
   %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], ptr @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42
-  %tmp1 = load i32, ptr %arrayidx13, align 4
-  %tmp2 = trunc i64 %indvars.iv45 to i32
-  %add = add nsw i32 %tmp1, %tmp2
+  %t1 = load i32, ptr %arrayidx13, align 4
+  %t2 = trunc i64 %indvars.iv45 to i32
+  %add = add nsw i32 %t1, %t2
   store i32 %add, ptr %arrayidx13, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp ne i64 %indvars.iv.next, 1000
diff --git a/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll b/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
index 718e9a8bbd3f5..9e8e30d670c51 100644
--- a/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
@@ -1,22 +1,42 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes='loop(loop-interchange),simplifycfg' -cache-line-size=64 -simplifycfg-require-and-preserve-domtree=1 -pass-remarks-output=%t \
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes='loop(loop-interchange),simplifycfg' -simplifycfg-require-and-preserve-domtree=1 \
 ; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange -stats -S 2>&1 \
 ; RUN:     | FileCheck -check-prefix=STATS %s
-; RUN: FileCheck -input-file %t %s
-
 
 ; no_deps_interchange just accesses a single nested array and can be interchange.
-; CHECK:      Name:       Interchanged
-; CHECK-NEXT: Function:   no_deps_interchange
-define i32 @no_deps_interchange(ptr nocapture %Arr) local_unnamed_addr #0 {
+;
+define i32 @no_deps_interchange(ptr nocapture %Arr) {
+; STATS-LABEL: define i32 @no_deps_interchange(
+; STATS-SAME: ptr nocapture [[ARR:%.*]]) {
+; STATS-NEXT:  [[ENTRY:.*]]:
+; STATS-NEXT:    br label %[[FOR2:.*]]
+; STATS:       [[FOR1_HEADER:.*]]:
+; STATS-NEXT:    [[INDVARS_IV19:%.*]] = phi i64 [ [[INDVARS_IV_NEXT20:%.*]], %[[FOR1_HEADER]] ], [ 0, %[[FOR2]] ]
+; STATS-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[ARR]], i64 [[INDVARS_IV:%.*]], i64 [[INDVARS_IV19]]
+; STATS-NEXT:    store i32 0, ptr [[ARRAYIDX6]], align 4
+; STATS-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; STATS-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 1024
+; STATS-NEXT:    [[INDVARS_IV_NEXT20]] = add nuw nsw i64 [[INDVARS_IV19]], 1
+; STATS-NEXT:    [[EXITCOND21:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT20]], 1024
+; STATS-NEXT:    br i1 [[EXITCOND21]], label %[[FOR1_HEADER]], label %[[FOR2_SPLIT:.*]]
+; STATS:       [[FOR2]]:
+; STATS-NEXT:    [[INDVARS_IV]] = phi i64 [ [[TMP0:%.*]], %[[FOR2_SPLIT]] ], [ 0, %[[ENTRY]] ]
+; STATS-NEXT:    br label %[[FOR1_HEADER]]
+; STATS:       [[FOR2_SPLIT]]:
+; STATS-NEXT:    [[TMP0]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; STATS-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 1024
+; STATS-NEXT:    br i1 [[TMP1]], label %[[FOR2]], label %[[EXIT:.*]]
+; STATS:       [[EXIT]]:
+; STATS-NEXT:    ret i32 0
+;
 entry:
   br label %for1.header
 
-for1.header:                                         ; preds = %entry, %for1.inc
+for1.header:
   %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
   br label %for2
 
-for2:                                        ; preds = %for1.header, %for2
+for2:
   %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
   %arrayidx6 = getelementptr inbounds [1024 x i32], ptr %Arr, i64 %indvars.iv, i64 %indvars.iv19
   store i32 0, ptr %arrayidx6, align 4
@@ -29,23 +49,42 @@ for1.inc:
   %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024
   br i1 %exitcond21, label %for1.header, label %exit
 
-exit:                                 ; preds = %for1.inc
+exit:
   ret i32 0
 
 }
 
 ; No memory access using any induction variables, interchanging not beneficial.
-; CHECK:      Name:        InterchangeNotProfitable
-; CHECK-NEXT: Function:    no_mem_instrs
+;
 define i32 @no_mem_instrs(ptr %ptr) {
+; STATS-LABEL: define i32 @no_mem_instrs(
+; STATS-SAME: ptr [[PTR:%.*]]) {
+; STATS-NEXT:  [[ENTRY:.*]]:
+; STATS-NEXT:    br label %[[FOR1_HEADER:.*]]
+; STATS:       [[FOR1_HEADER]]:
+; STATS-NEXT:    [[INDVARS_IV19:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT20:%.*]], %[[FOR1_INC:.*]] ]
+; STATS-NEXT:    br label %[[FOR2:.*]]
+; STATS:       [[FOR2]]:
+; STATS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR1_HEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR2]] ]
+; STATS-NEXT:    store i64 [[INDVARS_IV]], ptr [[PTR]], align 4
+; STATS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; STATS-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 1024
+; STATS-NEXT:    br i1 [[EXITCOND]], label %[[FOR2]], label %[[FOR1_INC]]
+; STATS:       [[FOR1_INC]]:
+; STATS-NEXT:    [[INDVARS_IV_NEXT20]] = add nuw nsw i64 [[INDVARS_IV19]], 1
+; STATS-NEXT:    [[EXITCOND21:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT20]], 1024
+; STATS-NEXT:    br i1 [[EXITCOND21]], label %[[FOR1_HEADER]], label %[[EXIT:.*]]
+; STATS:       [[EXIT]]:
+; STATS-NEXT:    ret i32 0
+;
 entry:
   br label %for1.header
 
-for1.header:                                         ; preds = %entry, %for1.inc
+for1.header:
   %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
   br label %for2
 
-for2:                                        ; preds = %for1.header, %for2
+for2:
   %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
   store i64 %indvars.iv, ptr %ptr, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -60,7 +99,3 @@ for1.inc:
 exit:                                 ; preds = %for1.inc
   ret i32 0
 }
-
-
-; Check stats, we interchanged 1 out of 3 loops.
-; STATS: 1 loop-interchange - Number of loops interchanged
diff --git a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
index 18832933b8841..6be86f1a8fdcf 100644
--- a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
@@ -1,24 +1,66 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
 
-;; Test for interchange in loop nest greater than 2.
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;j<100;j++)
-;;      for(int k=0;k<100;k++)
-;;        D[k][j][i] = D[k][j][i]+t;
-
-; CHECK: Processing InnerLoopId = 2 and OuterLoopId = 1
-; CHECK: Loops interchanged.
-
-; CHECK: Processing InnerLoopId = 1 and OuterLoopId = 0
-; CHECK: Loops interchanged.
-
+; Test for interchange in a loop nest of depth greater than 2.
+;  for(int i=0;i<100;i++)
+;    for(int j=0;j<100;j++)
+;      for(int k=0;k<100;k++)
+;        D[k][j][i] = D[k][j][i]+t;
+;
+; Loops InnerLoopId = 2 and OuterLoopId = 1 should be interchanged, and then
+; also InnerLoopId = 1 and OuterLoopId = 0.
+;
 define void @interchange_08(i32 %t){
+; CHECK-LABEL: define void @interchange_08(
+; CHECK-SAME: i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_BODY6_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[I_028:%.*]] = phi i64 [ [[INC16:%.*]], %[[FOR_INC15:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY6_SPLIT1:.*]]
+; CHECK:       [[FOR_COND4_PREHEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND4_PREHEADER:.*]]
+; CHECK:       [[FOR_COND4_PREHEADER]]:
+; CHECK-NEXT:    [[J_027:%.*]] = phi i64 [ [[TMP3:%.*]], %[[FOR_INC12_SPLIT:.*]] ], [ 0, %[[FOR_COND4_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER_PREHEADER]]
+; CHECK:       [[FOR_BODY6_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY6:.*]]
+; CHECK:       [[FOR_BODY6]]:
+; CHECK-NEXT:    [[K_026:%.*]] = phi i64 [ [[TMP1:%.*]], %[[FOR_BODY6_SPLIT:.*]] ], [ 0, %[[FOR_BODY6_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_COND4_PREHEADER_PREHEADER]]
+; CHECK:       [[FOR_BODY6_SPLIT1]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [100 x [100 x [100 x i32]]], ptr @D, i64 0, i64 [[K_026]], i64 [[J_027]], i64 [[I_028]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[T]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i64 [[K_026]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 100
+; CHECK-NEXT:    br label %[[FOR_INC12:.*]]
+; CHECK:       [[FOR_BODY6_SPLIT]]:
+; CHECK-NEXT:    [[TMP1]] = add nuw nsw i64 [[K_026]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 100
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[FOR_END17:.*]], label %[[FOR_BODY6]]
+; CHECK:       [[FOR_INC12]]:
+; CHECK-NEXT:    [[INC13:%.*]] = add nuw nsw i64 [[J_027]], 1
+; CHECK-NEXT:    [[EXITCOND29:%.*]] = icmp eq i64 [[INC13]], 100
+; CHECK-NEXT:    br label %[[FOR_INC15]]
+; CHECK:       [[FOR_INC12_SPLIT]]:
+; CHECK-NEXT:    [[TMP3]] = add nuw nsw i64 [[J_027]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 100
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[FOR_BODY6_SPLIT]], label %[[FOR_COND4_PREHEADER]]
+; CHECK:       [[FOR_INC15]]:
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i64 [[I_028]], 1
+; CHECK-NEXT:    [[EXITCOND30:%.*]] = icmp eq i64 [[INC16]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND30]], label %[[FOR_INC12_SPLIT]], label %[[FOR_COND1_PREHEADER]]
+; CHECK:       [[FOR_END17]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond1.preheader
 
diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
index ffb9f106c2880..529ebeb646690 100644
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
@@ -1,20 +1,42 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @A = common global [100 x [100 x i32]] zeroinitializer
 @B = common global [100 x i32] zeroinitializer
 
-;; Loops should not be interchanged in this case as it is not legal due to dependency.
-;;  for(int j=0;j<99;j++)
-;;   for(int i=0;i<99;i++)
-;;       A[j][i+1] = A[j+1][i]+k;
-
-; CHECK: Not interchanging loops. Cannot prove legality.
-
+; Loops should not be interchanged in this case as it is not legal due to a dependency.
+;
+;  for(int j=0;j<99;j++)
+;   for(int i=0;i<99;i++)
+;       A[j][i+1] = A[j+1][i]+k;
+;
 define void @interchange_04(i32 %k){
+; CHECK-LABEL: define void @interchange_04(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT24:%.*]], %[[FOR_INC12:.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    br label %[[FOR_BODY3:.*]]
+; CHECK:       [[FOR_BODY3]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[INDVARS_IV_NEXT24]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP0]], [[K]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[INDVARS_IV23]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_INC12]], label %[[FOR_BODY3]]
+; CHECK:       [[FOR_INC12]]:
+; CHECK-NEXT:    [[EXITCOND25:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT24]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND25]], label %[[FOR_END14:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK:       [[FOR_END14]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond1.preheader
 
diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
index ada8fb3f33eb5..6a7f43d19eaf2 100644
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
@@ -1,24 +1,61 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
 
-;; Test for interchange in loop nest greater than 2.
-;;  for(int i=0;i<100;i++)
-;;    for(int j=0;j<100;j++)
-;;      for(int k=0;k<100;k++)
-;;        D[i][k][j] = D[i][k][j]+t;
-
-; CHECK: Processing InnerLoopId = 2 and OuterLoopId = 1
-; CHECK: Loops interchanged.
-
-; CHECK: Processing InnerLoopId = 1 and OuterLoopId = 0
-; CHECK: Interchanging loops not profitable.
-
+; Test for interchange in loop nest greater than 2.
+;
+;  for(int i=0;i<100;i++)
+;    for(int j=0;j<100;j++)
+;      for(int k=0;k<100;k++)
+;        D[i][k][j] = D[i][k][j]+t;
+;
+; Loops InnerLoopId = 2 and OuterLoopId = 1 should be interchanged, but not
+; loops InnerLoopId = 1 and OuterLoopId = 0 as that is not profitable.
+;
 define void @interchange_08(i32 %t){
+; CHECK-LABEL: define void @interchange_08(
+; CHECK-SAME: i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[I_028:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC16:%.*]], %[[FOR_INC15:.*]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY6_PREHEADER:.*]]
+; CHECK:       [[FOR_COND4_PREHEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND4_PREHEADER:.*]]
+; CHECK:       [[FOR_COND4_PREHEADER]]:
+; CHECK-NEXT:    [[J_027:%.*]] = phi i64 [ [[INC13:%.*]], %[[FOR_INC12:.*]] ], [ 0, %[[FOR_COND4_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY6_SPLIT1:.*]]
+; CHECK:       [[FOR_BODY6_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY6:.*]]
+; CHECK:       [[FOR_BODY6]]:
+; CHECK-NEXT:    [[K_026:%.*]] = phi i64 [ [[TMP1:%.*]], %[[FOR_BODY6_SPLIT:.*]] ], [ 0, %[[FOR_BODY6_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_COND4_PREHEADER_PREHEADER]]
+; CHECK:       [[FOR_BODY6_SPLIT1]]:
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds [100 x [100 x [100 x i32]]], ptr @D, i32 0, i64 [[I_028]], i64 [[K_026]], i64 [[J_027]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[T]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i64 [[K_026]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 100
+; CHECK-NEXT:    br label %[[FOR_INC12]]
+; CHECK:       [[FOR_BODY6_SPLIT]]:
+; CHECK-NEXT:    [[TMP1]] = add nuw nsw i64 [[K_026]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 100
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[FOR_INC15]], label %[[FOR_BODY6]]
+; CHECK:       [[FOR_INC12]]:
+; CHECK-NEXT:    [[INC13]] = add nuw nsw i64 [[J_027]], 1
+; CHECK-NEXT:    [[EXITCOND29:%.*]] = icmp eq i64 [[INC13]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND29]], label %[[FOR_BODY6_SPLIT]], label %[[FOR_COND4_PREHEADER]]
+; CHECK:       [[FOR_INC15]]:
+; CHECK-NEXT:    [[INC16]] = add nuw nsw i64 [[I_028]], 1
+; CHECK-NEXT:    [[EXITCOND30:%.*]] = icmp eq i64 [[INC16]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND30]], label %[[FOR_END17:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK:       [[FOR_END17]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond1.preheader
 
diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
index b8e569afe8645..d8b47682bbadc 100644
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
@@ -1,6 +1,5 @@
-; REQUIRES: asserts
-; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
-; RUN:     -S -debug 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -9,16 +8,51 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @C = common global [100 x [100 x i32]] zeroinitializer
 @D = common global [100 x [100 x [100 x i32]]] zeroinitializer
 
-;; Loops not tightly nested are not interchanged
-;;  for(int j=0;j<N;j++) {
-;;    B[j] = j+k;
-;;    for(int i=0;i<N;i++)
-;;      A[j][i] = A[j][i]+B[j];
-;;  }
-
-; CHECK: Not interchanging loops. Cannot prove legality.
-
+; Loops not tightly nested are not interchanged
+;
+;  for(int j=0;j<N;j++) {
+;    B[j] = j+k;
+;    for(int i=0;i<N;i++)
+;      A[j][i] = A[j][i]+B[j];
+;  }
+;
 define void @interchange_05(i32 %k, i32 %N){
+; CHECK-LABEL: define void @interchange_05(
+; CHECK-SAME: i32 [[K:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP30:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP30]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END17:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[K]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV32:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT33:%.*]], %[[FOR_INC15:.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV32]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr @B, i64 0, i64 [[INDVARS_IV32]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label %[[FOR_BODY3:.*]]
+; CHECK:       [[FOR_BODY3]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[INDVARS_IV32]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_INC15]], label %[[FOR_BODY3]]
+; CHECK:       [[FOR_INC15]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT33]] = add nuw nsw i64 [[INDVARS_IV32]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV32]] to i32
+; CHECK-NEXT:    [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND36]], label %[[FOR_END17_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END17_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END17]]
+; CHECK:       [[FOR_END17]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp30 = icmp sgt i32 %N, 0
   br i1 %cmp30, label %for.body.lr.ph, label %for.end17
@@ -59,16 +93,46 @@ for.end17:
 
 declare void @foo(...) readnone
 
-;; Loops not tightly nested are not interchanged
-;;  for(int j=0;j<N;j++) {
-;;    foo();
-;;    for(int i=2;i<N;i++)
-;;      A[j][i] = A[j][i]+k;
-;;  }
-
-; CHECK: Not interchanging loops. Cannot prove legality.
-
+; Loops not tightly nested are not interchanged
+;  for(int j=0;j<N;j++) {
+;    foo();
+;    for(int i=2;i<N;i++)
+;      A[j][i] = A[j][i]+k;
+;  }
+;
 define void @interchange_06(i32 %k, i32 %N) {
+; CHECK-LABEL: define void @interchange_06(
+; CHECK-SAME: i32 [[K:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP22:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP22]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END12:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV24:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT25:%.*]], %[[FOR_INC10:.*]] ]
+; CHECK-NEXT:    tail call void (...) @foo()
+; CHECK-NEXT:    br label %[[FOR_BODY3:.*]]
+; CHECK:       [[FOR_BODY3]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3]] ], [ 2, %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[INDVARS_IV24]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[K]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_INC10]], label %[[FOR_BODY3]]
+; CHECK:       [[FOR_INC10]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT25]] = add nuw nsw i64 [[INDVARS_IV24]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV26:%.*]] = trunc i64 [[INDVARS_IV24]] to i32
+; CHECK-NEXT:    [[EXITCOND27:%.*]] = icmp eq i32 [[LFTR_WIDEIV26]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND27]], label %[[FOR_END12_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END12_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END12]]
+; CHECK:       [[FOR_END12]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %cmp22 = icmp sgt i32 %N, 0
   br i1 %cmp22, label %for.body.lr.ph, label %for.end12
@@ -103,11 +167,41 @@ for.end12:
   ret void
 }
 
-;; The following Loop is not considered tightly nested and is not interchanged.
-;; The outer loop header does not branch to the inner loop preheader, or the
-;; inner loop header, or the outer loop latch.
-; CHECK: Not interchanging loops. Cannot prove legality.
+; The following loop is not considered tightly nested and is not interchanged.
+; The outer loop header does not branch to the inner loop preheader, or the
+; inner loop header, or the outer loop latch.
+;
 define void @interchange_07(i32 %k, i32 %N, i64 %ny) {
+; CHECK-LABEL: define void @interchange_07(
+; CHECK-SAME: i32 [[K:%.*]], i32 [[N:%.*]], i64 [[NY:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR1_HEADER:.*]]
+; CHECK:       [[FOR1_HEADER]]:
+; CHECK-NEXT:    [[J23:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[J_NEXT24:%.*]], %[[FOR1_INC10:.*]] ]
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp slt i64 0, [[NY]]
+; CHECK-NEXT:    br label %[[SINGLESUCC:.*]]
+; CHECK:       [[SINGLESUCC]]:
+; CHECK-NEXT:    br i1 [[CMP21]], label %[[PREHEADER_J:.*]], label %[[FOR1_INC10]]
+; CHECK:       [[PREHEADER_J]]:
+; CHECK-NEXT:    br label %[[FOR2:.*]]
+; CHECK:       [[FOR2]]:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR2]] ], [ 0, %[[PREHEADER_J]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr @A, i64 0, i64 [[J]], i64 [[J23]]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[LV]], [[K]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[J]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR1_INC10_LOOPEXIT:.*]], label %[[FOR2]]
+; CHECK:       [[FOR1_INC10_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR1_INC10]]
+; CHECK:       [[FOR1_INC10]]:
+; CHECK-NEXT:    [[J_NEXT24]] = add nuw nsw i64 [[J23]], 1
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i64 [[J23]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label %[[FOR_END12:.*]], label %[[FOR1_HEADER]]
+; CHECK:       [[FOR_END12]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for1.header
 
diff --git a/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll b/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll
new file mode 100644
index 0000000000000..6943e39cf163e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll
@@ -0,0 +1,45 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -S -debug 2>&1 | FileCheck %s
+
+; CHECK:       Dependency matrix before interchange:
+; CHECK-NEXT:  I I
+; CHECK-NEXT:  = S
+; CHECK-NEXT:  < S
+; CHECK-NEXT:  Processing InnerLoopId
+
+; This example is taken from GitHub issue #54176.
+;
+define void @foo(i32 noundef %n, i32 noundef %m, ptr nocapture noundef %aa, ptr nocapture noundef readonly %bb, ptr nocapture noundef writeonly %cc) {
+entry:
+  %arrayidx7 = getelementptr inbounds i8, ptr %aa, i64 512
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv32 = phi i64 [ 1, %entry ], [ %indvars.iv.next33, %for.cond.cleanup3 ]
+  %0 = add nsw i64 %indvars.iv32, -1
+  %arrayidx9 = getelementptr inbounds [128 x float], ptr %arrayidx7, i64 0, i64 %0
+  %arrayidx12 = getelementptr inbounds [128 x float], ptr %arrayidx7, i64 0, i64 %indvars.iv32
+  br label %for.body4
+
+for.cond.cleanup:
+  ret void
+
+for.cond.cleanup3:
+  %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
+  %exitcond36 = icmp ne i64 %indvars.iv.next33, 128
+  br i1 %exitcond36, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4:
+  %indvars.iv = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %arrayidx6 = getelementptr inbounds [128 x float], ptr %bb, i64 %indvars.iv, i64 %indvars.iv32
+  %1 = load float, ptr %arrayidx6, align 4
+  %2 = load float, ptr %arrayidx9, align 4
+  %add = fadd fast float %2, %1
+  store float %add, ptr %arrayidx9, align 4
+  %3 = load float, ptr %arrayidx12, align 4
+  %arrayidx16 = getelementptr inbounds [128 x float], ptr %cc, i64 %indvars.iv, i64 %indvars.iv32
+  store float %3, ptr %arrayidx16, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index dd3d4215a3f63..e6d93ea192e56 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        Cost of 5 for VF 2: profitable to scalarize   %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 28830f9bcd11e..aa78113ebaa48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -11,9 +11,14 @@
 ; CM: LV: Found uniform instruction:   %a = extractvalue { i64, i64 } %sv, 0
 ; CM: LV: Found uniform instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
+
 ; CM: LV: Scalar loop costs: 5.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
@@ -58,12 +63,14 @@ exit:
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @powf(float, float) readnone nounwind
 
-; CM: LV: Found uniform instruction:   %a = extractvalue { float, float } %sv, 0
-; CM: LV: Found uniform instruction:   %b = extractvalue { float, float } %sv, 1
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
 
 ; CM: LV: Scalar loop costs: 14.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index 2b881fe19902e..8320608d67588 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -5,6 +5,8 @@
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
 
 ; Tests for selecting interleave counts for loops with loads and stores.
 
@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
 ; INTERLEAVE-2:       exit:
 ; INTERLEAVE-2-NEXT:    ret void
 ;
+; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
+; INTERLEAVE-4-VLA:       call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT:  call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index bf64dccdb2667..fc2f8a0dcabf5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -5,6 +5,8 @@
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
 ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
 
 ; Tests for selecting the interleave count for loops with reductions.
 
@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
 ; INTERLEAVE-2-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; INTERLEAVE-2-NEXT:    ret i32 [[RED_NEXT_LCSSA]]
 ;
+; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
+; INTERLEAVE-4-VLA:       add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT:  add <vscale x 4 x i32>
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index e366deb3ff777..89d5ba6e879df 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-COST-LABEL: sadd
 ; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @saddsat(
@@ -129,10 +129,10 @@ while.end:                                        ; preds = %while.body, %entry
 
 ; CHECK-COST-LABEL: umin
 ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
 
 define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @umin(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 6b4cfa091c45e..93bc131ee5c5a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
+; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
index 5b90456a4f458..05e0b7ca7520f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
@@ -10,10 +10,10 @@ target triple = "aarch64"
 ; due to invalid cost decisions. The loop below has a low maximum trip count,
 ; so will be masked.
 
-; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %0 = load
+; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
 ; COST: LV: Selecting VF: 1.
 
 define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
new file mode 100644
index 0000000000000..9e42c3c5dcab7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
@@ -0,0 +1,118 @@
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
+; CHECK-LABEL: @V1(
+; CHECK-NOT:   vec.epilog.ph:
+; CHECK-NOT:   vec.epilog.vector.body:
+; CHECK-NOT:   vec.epilog.middle.block:
+; CHECK-NOT:   vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
+;
+; CHECK-LABEL: @V2(
+; CHECK:       vec.epilog.ph:
+; CHECK:       vec.epilog.vector.body:
+; CHECK:       vec.epilog.middle.block:
+; CHECK:       vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+; TODO: V3 will generate a scalable vector body, so it doesn't need an
+; epilogue loop, but we still need to check that this is really the best
+; thing to do for the V3.
+;
+define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
+;
+; CHECK-LABEL: @V3(
+; CHECK-NOT:   vec.epilog.ph:
+; CHECK-NOT:   vec.epilog.vector.body:
+; CHECK-NOT:   vec.epilog.middle.block:
+; CHECK-NOT:   vec.epilog.scalar.ph:
+;
+entry:
+  %4 = icmp sgt i32 %2, 0
+  br i1 %4, label %5, label %8
+
+5:
+  %6 = zext nneg i32 %2 to i64
+  br label %9
+
+7:
+  br label %8
+
+8:
+  ret i32 42
+
+9:
+  %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+  %11 = getelementptr inbounds double, ptr %0, i64 %10
+  %12 = load double, ptr %11, align 8
+  %13 = getelementptr inbounds double, ptr %1, i64 %10
+  %14 = load double, ptr %13, align 8
+  %15 = fadd fast double %14, %12
+  store double %15, ptr %11, align 8
+  %16 = add nuw nsw i64 %10, 1
+  %17 = icmp eq i64 %16, %6
+  br i1 %17, label %7, label %9
+}
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }
+
+attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }
+
+attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 785241d342ddc..089d279d15245 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
 
 ; CHECK-LABEL: PR33193
 ; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
 ; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(ptr %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index 92b043a9c29d5..fb5ff66989a67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;; registers required for a <vscale x 4 x fp128> when trying to maximize
 ;; vector bandwidth with SVE.
 
-; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
+; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext  ir<%load.in> to fp128
 
 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
 ; CHECK-LABEL: define void @load_ext_trunc_store(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index f28f77bf1b155..225108fe89de0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,49 +1,55 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
 
-; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
+
+; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.
+; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.
+; GENERIC: LV: Selecting VF: vscale x 16
 
-; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.
+; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.
+; NEOVERSE-V1: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.
+; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.
+; NEOVERSE-N2: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
-; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.
+; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.
+; NEOVERSE-V2: LV: Selecting VF: 16
 
-; VF-4: <4 x i32>
-; VF-VSCALE4: <16 x i32>
+; VF-16: <16 x i8>
+; VF-VSCALE16: <vscale x 16 x i8>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
   %1 = load i8, ptr %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
-  store i32 %add, ptr %arrayidx5, align 4
+  %add = add nsw i8 %0, %1
+  %arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
+  store i8 %add, ptr %arrayidx5, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %exit, label %loop
@@ -51,4 +57,3 @@ loop:
 exit:
   ret void
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index 2bcc93127da1e..6165a73e77f23 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
 
 define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
 ; CHECK: LV: Checking a loop in 'selects_1'
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
 ; CHECK: LV: Selecting VF: 4
 
 entry:
@@ -48,9 +50,11 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
 ; CHECK: LV: Checking a loop in 'multi_user_cmp'
-; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction:   %cmp1 = fcmp olt float %load1, 0.000000e+00
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
+; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
 ; CHECK: LV: Selecting VF: 16.
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
index 454a9789142f8..52d343e4105c7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
+; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
 
@@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6
 ; CHECK-EPILOG:      vec.epilog.vector.body:
 ; CHECK-EPILOG:        load <vscale x 4 x i16>
 
+; The epilogue loop gets vectorised vscale x 2 x i16 wide.
+; CHECK-EPILOG-V2:      vec.epilog.ph:
+; CHECK-EPILOG-V2:      vec.epilog.vector.body:
+; CHECK-EPILOG-V2:        load <vscale x 2 x i16>
+
 ; CHECK-NO-EPILOG-NOT:  vec.epilog.vector.ph:
 ; CHECK-NO-EPILOG-NOT:  vec.epilog.vector.body:
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 961fa59cadd36..b418905e514af 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -4,9 +4,9 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %addi7 = add i7 %indvars.iv1294, 0
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
 
 define void @induction_i7(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end:                                          ; preds = %for.body
 }
 
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %zexti3 = zext i3 %indvars.iv1294 to i64
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 83bd07a3ec02c..b84e8de678140 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -7,10 +7,15 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK: Cost of 8 for VF vscale x 4:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -31,8 +36,11 @@ for.end:
 }
 
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd double %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index dec3c286345ad..468cc8f2a7278 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = zext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = zext i8 %0 to i32
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -91,15 +91,14 @@ exit:                                 ; preds = %for.body
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF 8 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 16 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = sext i8 %0 to i32
-; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = sext i8 %0 to i32
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index e7a361ecdbff0..57bc21093ca95 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -4,6 +4,7 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
+; CHECK-LABEL: LV: Checking a loop in 'expensive_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i16, ptr %arrayidx, align 2
@@ -18,48 +19,57 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 ; CHECK: LV: Scalar loop costs: 5.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 20 for VF 2 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 2 costs: 43.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 4 costs: 2.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i16, ptr %arrayidx, align 2
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 36 for VF 8 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv6 = add i16 %1, %0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx7 = getelementptr inbounds i16, ptr %d, i32 %i.016
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i16 %conv6, ptr %arrayidx7, align 2
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br label %for.inc
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %inc = add nuw nsw i32 %i.016, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 8 costs: 5.
+; CHECK: Cost of 1 for VF 2: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 16 for VF 2: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43.
+; CHECK: Cost of 1 for VF 4: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2.
+; CHECK: Cost of 1 for VF 8: induction instruction   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
+; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%5> = vector-pointer ir<%arrayidx7>
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%5>, ir<%conv6>, ir<%cmp2>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5.
 ; CHECK: LV: Selecting VF: 4.
 define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
 entry:
@@ -94,6 +104,7 @@ for.inc:                                          ; preds = %for.body, %if.then
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; CHECK-LABEL: LV: Checking a loop in 'cheap_icmp'
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
@@ -115,90 +126,134 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp.not = icmp eq i32 %dec, 0
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
 ; CHECK: LV: Scalar loop costs: 9.
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 22 for VF 2 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 2 costs: 65.
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 4 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 8 costs: 3.
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %0 = load i8, ptr %pSrcA.addr.011, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv1 = sext i8 %0 to i32
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load i8, ptr %pSrcB.addr.09, align 1
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %shr = ashr i32 %mul, 7
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
-; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   store i8 %conv4, ptr %pDst.addr.010, align 1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %dec = add i32 %blkCnt.012, -1
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %cmp.not = icmp eq i32 %dec, 0
-; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: Cost of 1 for VF 2: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 2: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 2: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 2: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 2: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 2: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 18 for VF 2: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65.
+; CHECK: Cost of 1 for VF 4: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 4: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 4: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 4: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 4: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 4: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 4: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 4: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 4: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3.
+; CHECK: Cost of 1 for VF 8: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 8: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 8: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 8: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 8: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 8: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 8: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 8: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 8: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 8: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 8: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 8: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3.
+; CHECK: Cost of 1 for VF 16: induction instruction   %dec = add i32 %blkCnt.012, -1
+; CHECK: Cost of 0 for VF 16: induction instruction   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr5 = getelementptr inbounds i8, ptr %pDst.addr.010, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
+; CHECK: Cost of 0 for VF 16: induction instruction   %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: Cost of 1 for VF 16: exit condition instruction   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 0 for VF 16: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<%3>
+; CHECK: Cost of 0 for VF 16: vp<%4> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<%4>
+; CHECK: Cost of 0 for VF 16: vp<%5> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<%5>
+; CHECK: Cost of 0 for VF 16: vp<%6> = vector-pointer vp<%next.gep>
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<%6>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 16: vp<%7> = vector-pointer vp<%next.gep>.2
+; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<%7>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
+; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
+; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
+; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 16: vp<%8> = vector-pointer vp<%next.gep>.1
+; CHECK: Cost of 2 for VF 16: WIDEN store vp<%8>, ir<%conv4>
+; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK: Cost for VF 16: 50
 ; CHECK: LV: Selecting VF: 16.
 define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
@@ -238,8 +293,8 @@ while.end:                                        ; preds = %while.end.loopexit,
 }
 
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction:   %cmp1 = fcmp
+; CHECK: Cost of 12 for VF 2: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
+; CHECK: Cost of 24 for VF 4: WIDEN ir<%cmp1> = fcmp olt ir<%0>, ir<0.000000e+00>
 define void @floatcmp(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) #0 {
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
index bd7bafdb6dc8d..04a97f451770a 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll
@@ -7,10 +7,10 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: arm_offset_q15
-; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 36 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 8 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 2 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @arm_offset_q15(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @arm_offset_q15(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index 60c3b52f2003a..025b8738f7829 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 
 ; CHECK-COST-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 26 for VF 2 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %or.cond = select i1 %cmp2, i1 true, i1 %cmp3
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN-SELECT ir<%or.cond> = select ir<%cmp2>, ir<true>, ir<%cmp3>
 
 define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %blockSize) #0 {
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
index a0ff7629b42c3..e1b7b935a47f6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -8,8 +8,8 @@ target triple = "thumbv8.1m.main-none-none-eabi"
 ; CHECK-LABEL: test
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %and515 = shl i32 %l41, 3
 ; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
-; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%and515> = shl ir<%l41>, ir<3>
+; CHECK-COST: Cost of 2 for VF 4: WIDEN ir<%l45> = and ir<%and515>, ir<131072>
 ; CHECK-NOT: vector.body
 
 define void @test(ptr %src, i32 %N) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
index f33ef9199ed1a..09a1aaab6cc2d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll
@@ -4,7 +4,7 @@
 ; CHECK: LV: Loop hints: force=enabled
 ; CHECK: LV: Scalar loop costs: 4.
 ; ChosenFactor.Cost is 4, but the real cost will be divided by the width, which is 2.
-; CHECK: LV: Vector loop of width 2 costs: 2.
+; CHECK: Cost for VF 2: 4 (Estimated cost per lane: 2.0)
 ; Regardless of force vectorization or not, this loop will eventually be vectorized because of the cost model.
 ; Therefore, the following message does not need to be printed even if vectorization is explicitly forced in the metadata.
 ; CHECK-NOT: LV: Vectorization seems to be not beneficial, but was forced by a user.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
index c75899e2224f1..a0c45962021c8 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -11,10 +11,12 @@
 ; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun0'
-; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
-; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+; CHECK: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 4 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
+
 define void @fun0(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -48,7 +50,8 @@ for.end:
 ; which gives a cost of 5.
 ;
 ; CHECK: LV: Checking a loop in 'fun1'
-; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 3 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
 define void @fun1(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -73,10 +76,11 @@ for.end:
 ; produce the vector values, which gives a cost of 6.
 ;
 ; CHECK: LV: Checking a loop in 'fun2'
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 32 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun2(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
@@ -112,10 +116,11 @@ for.end:
 ; vector register boundary.
 ;
 ; CHECK: LV: Checking a loop in 'fun3'
-; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 30 at %ld0, vp<%next.gep>
+; CHECK:   ir<%ld0> = load from index 0
+; CHECK:   ir<%ld1> = load from index 1
+; CHECK:   ir<%ld2> = load from index 2
+; CHECK:   ir<%ld3> = load from index 3
 define void @fun3(ptr %ptr, ptr %dst) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
index dae341bcf5380..f7d77bafc2a11 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK: Found an estimated cost of 1 for VF 1 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %neg = fneg float %{{.*}}
-; CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %neg = fneg float %{{.*}}
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%neg> = fneg ir<%0>
+; CHECK: Cost of 1 for VF 4: WIDEN ir<%neg> = fneg ir<%0>
 define void @fneg_cost(ptr %a, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
index 654376cc54f4a..a6d38fb794d44 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -32,38 +32,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i8 %1 to i32
 ; sources of the mul is sext\sext from i8
 ; use pmullw\sext seq.
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i8
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i8 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i8
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i8 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-120
 ; use pmullw\sext
-; SLM:  cost of 3 for VF 2 {{.*}} mul nsw i32 -120, %conv3
+; SLM: Cost of 3 for VF 2: WIDEN ir<%mul4> = mul nsw ir<-120>, ir<%conv3>
   %mul4 = mul nsw i32 -120, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\250
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv3
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul5> = mul nsw ir<250>, ir<%conv3>
   %mul5 = mul nsw i32 250, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-120
 ; use pmulhw\pmullw\pshuf
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 -120, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul6> = mul nsw ir<-120>, ir<%conv4>
   %mul6 = mul nsw i32 -120, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\250
 ; use pmullw\zext
-; SLM:  cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 2 for VF 2: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
@@ -101,38 +101,38 @@ for.body:                                         ; preds = %for.body.preheader,
   %conv3 = sext i16 %1 to i32
 ; sources of the mul is sext\sext from i16
 ; use pmulhw\pmullw\pshuf seq.
-; SLM:  cost of 3 for VF 4 {{.*}} mul nsw i32 %conv3, %conv
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv>
   %mul = mul nsw i32 %conv3, %conv
 ; sources of the mul is zext\sext from i16
 ; use pmulld
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv4, %conv
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul2> = mul nsw ir<%conv4>, ir<%conv>
   %conv4 = zext i16 %1 to i32
   %mul2 = mul nsw i32 %conv4, %conv
   %sum0 = add i32 %mul, %mul2
 ; sources of the mul is zext\zext from i16
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 %conv5, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul3> = mul nsw ir<%conv5>, ir<%conv4>
   %conv5 = zext i16 %0 to i32
   %mul3 = mul nsw i32 %conv5, %conv4
   %sum1 = add i32 %sum0, %mul3
 ; sources of the mul is sext\-32000
 ; use pmulhw\pmullw\sext
-; SLM:  cost of 2 for VF 4 {{.*}} mul nsw i32 -32000, %conv3
+; SLM: Cost of 2 for VF 4: WIDEN ir<%mul4> = mul nsw ir<-32000>, ir<%conv3>
   %mul4 = mul nsw i32 -32000, %conv3
   %sum2 = add i32 %sum1, %mul4
 ; sources of the mul is sext\64000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 64000, %conv3
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul5> = mul nsw ir<64000>, ir<%conv3>
   %mul5 = mul nsw i32 64000, %conv3
   %sum3 = add i32 %sum2, %mul5
 ; sources of the mul is zext\-32000
 ; use pmulld
-; SLM:  cost of 11 for VF 4 {{.*}} mul nsw i32 -32000, %conv4
+; SLM: Cost of 11 for VF 4: WIDEN ir<%mul6> = mul nsw ir<-32000>, ir<%conv4>
   %mul6 = mul nsw i32 -32000, %conv4
   %sum4 = add i32 %sum3, %mul6
 ; sources of the mul is zext\64000
 ; use pmulhw\pmullw\zext
-; SLM:  cost of 5 for VF 4 {{.*}} mul nsw i32 250, %conv4
+; SLM: Cost of 5 for VF 4: WIDEN ir<%mul7> = mul nsw ir<250>, ir<%conv4>
   %mul7 = mul nsw i32 250, %conv4
   %sum5 = add i32 %sum4, %mul7
   %add = add i32 %acc.013, 5
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 7d435cc85c9df..052a963f5458b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -28,21 +28,28 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = trunc
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   %{{.*}} = icmp
 ; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction:   br
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = phi
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = getelementptr
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = load
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = zext i8 %{{.*}} to i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = and i32 %{{.*}}, 255
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = add
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = trunc
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{.*}} = icmp
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   br
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: Cost of 1 for VF 2: induction instruction   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK: Cost of 1 for VF 2: exit condition instruction   %exitcond = icmp eq i32 %lftr.wideiv, %n
+; CHECK: Cost of 0 for VF 2: exit condition instruction   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi ir<0>, vp<%8>
+; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc  ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext  vp<%7> to i32
+; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
 ;
 define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
index 1660f1af06c6c..21146e2e82216 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -4,11 +4,11 @@
 ; Check that cost model is not executed twice for VF=2 when vectorization is
 ; forced for a particular loop.
 
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   %{{[0-9]+}} = load i32
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction:   store i32
-; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}.
+; CHECK: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN {{.*}} = load
+; CHECK-NOT: Cost of {{.*}} for VF 2: WIDEN store
+; CHECK: Cost for VF 2: 5 (Estimated cost per lane: 2.
 
 define i32 @foo(ptr %A, i32 %n) {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index ef0402aaae802..b8dcfd31bbc4c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 10 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
index d56740c8293c0..166875dd55aae 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -3,8 +3,8 @@
 
 ; CHECK: 'foo'
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %shift = ashr i32 %val, %k
-; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shift = ashr i32 %val, %k
+; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
+; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
 define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 {
 entry:
   br label %body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index 234587aae1283..22eb0ca380033 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -23,8 +23,8 @@ define void @scalarselect(i1 %cond) {
   %7 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %cond, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant)
 
   %sel = select i1 %cond, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4
@@ -52,8 +52,8 @@ define void @vectorselect(i1 %cond) {
   %8 = icmp ult i64 %indvars.iv, 8
 
 ; CHECK: cost of 1 for VF 1 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
-; CHECK: cost of 2 for VF 4 {{.*}}  select i1 %8, i32 %6, i32 0
+; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
+; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%8>, ir<%6>, ir<0>
 
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, ptr %7, align 4
diff --git a/llvm/test/Transforms/NewGVN/tbaa.ll b/llvm/test/Transforms/NewGVN/tbaa.ll
index 335e782acc8bc..20c09aa68726a 100644
--- a/llvm/test/Transforms/NewGVN/tbaa.ll
+++ b/llvm/test/Transforms/NewGVN/tbaa.ll
@@ -4,7 +4,7 @@
 define i32 @test1(ptr %p, ptr %q) {
 ; CHECK-LABEL: define i32 @test1(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -17,7 +17,7 @@ define i32 @test1(ptr %p, ptr %q) {
 define i32 @test2(ptr %p, ptr %q) {
 ; CHECK-LABEL: define i32 @test2(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -56,7 +56,7 @@ define i32 @test4(ptr %p, ptr %q) {
 define i32 @test5(ptr %p, ptr %q) {
 ; CHECK-LABEL: define i32 @test5(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -69,7 +69,7 @@ define i32 @test5(ptr %p, ptr %q) {
 define i32 @test6(ptr %p, ptr %q) {
 ; CHECK-LABEL: define i32 @test6(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -82,7 +82,7 @@ define i32 @test6(ptr %p, ptr %q) {
 define i32 @test7(ptr %p, ptr %q) {
 ; CHECK-LABEL: define i32 @test7(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -129,7 +129,7 @@ define i32 @test10(ptr %p, ptr %q) {
 ; and not just the common final access type.
 ; CHECK-LABEL: define i32 @test10(
 ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
-; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]]
 ; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -172,8 +172,11 @@ declare i32 @foo(ptr) readonly
 ; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
 ; CHECK: [[META5]] = !{!"B", [[META2]]}
 ; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0}
-; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 0}
-; CHECK: [[META8]] = !{!"struct X", [[META9]], i64 0}
-; CHECK: [[META9]] = !{!"int", [[META10:![0-9]+]], i64 0}
-; CHECK: [[META10]] = !{!"char", [[META3]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+; CHECK: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]}
+; CHECK: [[META9]] = !{!"another root"}
+; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0}
+; CHECK: [[META11]] = !{!"struct X", [[META12]], i64 0}
+; CHECK: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0}
+; CHECK: [[META13]] = !{!"char", [[META3]], i64 0}
 ;.
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
index 816ed6e831153..076cc3f4dddd1 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
@@ -15,83 +15,39 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f
 ; CHECK:       [[FOR_BODY_PREHEADER]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[NFACE]] to i64
 ; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[TMP0]]
-; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NFACE]], 4
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]]
-; CHECK:       [[FOR_BODY_PREHEADER_NEW]]:
-; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP0]], 2147483644
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_BODY_PREHEADER14:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER14]]:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[UNROLL_ITER:%.*]], %[[MIDDLE_BLOCK:.*]] ]
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
-; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]:
-; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_EPIL:.*]]
-; CHECK:       [[FOR_BODY_EPIL]]:
-; CHECK-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ], [ [[INDVARS_IV_UNR]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ]
-; CHECK-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ], [ 0, %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[UNROLL_ITER]] = and i64 [[TMP0]], 2147483644
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_EPIL]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0:![0-9]+]], !llvm.access.group [[ACC_GRP4:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0:![0-9]+]], !llvm.access.group [[ACC_GRP4:![0-9]+]]
 ; CHECK-NEXT:    [[GEP_EPIL:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_EPIL]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[GEP_EPIL]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[IDXPROM3_EPIL:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_EPIL]]
-; CHECK-NEXT:    [[IDXPROM5_EPIL:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_EPIL]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARRAYIDX4_EPIL]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX6_EPIL]], align 8
-; CHECK-NEXT:    [[CMP_I_EPIL:%.*]] = fcmp fast olt double [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[CMP_I_EPIL]], double [[TMP5]], double [[TMP4]]
-; CHECK-NEXT:    store double [[TMP6]], ptr [[ARRAYIDX4_EPIL]], align 8, !tbaa [[TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
-; CHECK-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
-; CHECK-NEXT:    [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-NEXT:    br i1 [[EPIL_ITER_CMP_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_EPIL]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[GEP_EPIL]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[Y]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD12]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER13:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp fast olt <4 x double> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x double> [[WIDE_MASKED_GATHER13]], <4 x double> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV_EPIL]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UNROLL_ITER]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_PREHEADER14]]
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[IDXPROM3:%.*]] = sext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3]]
-; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp fast olt double [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[CMP_I]], double [[TMP10]], double [[TMP9]]
-; CHECK-NEXT:    store double [[TMP11]], ptr [[ARRAYIDX4]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[GEP_1]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[IDXPROM3_1:%.*]] = sext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_1]]
-; CHECK-NEXT:    [[IDXPROM5_1:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX4_1]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX6_1]], align 8
-; CHECK-NEXT:    [[CMP_I_1:%.*]] = fcmp fast olt double [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[CMP_I_1]], double [[TMP15]], double [[TMP14]]
-; CHECK-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4_1]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or disjoint i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[GEP_2]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[IDXPROM3_2:%.*]] = sext i32 [[TMP17]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_2]]
-; CHECK-NEXT:    [[IDXPROM5_2:%.*]] = sext i32 [[TMP18]] to i64
-; CHECK-NEXT:    [[ARRAYIDX6_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[ARRAYIDX4_2]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load double, ptr [[ARRAYIDX6_2]], align 8
-; CHECK-NEXT:    [[CMP_I_2:%.*]] = fcmp fast olt double [[TMP19]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[CMP_I_2]], double [[TMP20]], double [[TMP19]]
-; CHECK-NEXT:    store double [[TMP21]], ptr [[ARRAYIDX4_2]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or disjoint i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER14]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]]
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT_2]]
@@ -100,15 +56,14 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f
 ; CHECK-NEXT:    [[ARRAYIDX4_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_3]]
 ; CHECK-NEXT:    [[IDXPROM5_3:%.*]] = sext i32 [[TMP23]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX6_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_3]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr [[ARRAYIDX4_3]], align 8
-; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr [[ARRAYIDX6_3]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr [[ARRAYIDX4_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr [[ARRAYIDX6_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
 ; CHECK-NEXT:    [[CMP_I_3:%.*]] = fcmp fast olt double [[TMP24]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[CMP_I_3]], double [[TMP25]], double [[TMP24]]
 ; CHECK-NEXT:    store double [[TMP26]], ptr [[ARRAYIDX4_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
-; CHECK-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
-; CHECK-NEXT:    br i1 [[NITER_NCMP_3]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ;
 entry:
   %nface.addr = alloca i32, align 4
@@ -242,10 +197,10 @@ attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: re
 ; CHECK: [[ACC_GRP4]] = distinct !{}
 ; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
 ; CHECK: [[META6]] = !{!"double", [[META2]], i64 0}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]]}
-; CHECK: [[META8]] = !{!"llvm.loop.unroll.disable"}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]], [[META12:![0-9]+]]}
-; CHECK: [[META10]] = !{!"llvm.loop.mustprogress"}
-; CHECK: [[META11]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP4]]}
-; CHECK: [[META12]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]}
+; CHECK: [[META8]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META9]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP4]]}
+; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META8]], [[META9]], [[META11]], [[META10]]}
 ;.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
index 252fe7361f07c..57c877a58d5c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
@@ -551,3 +551,57 @@ define <4 x i32> @slp_v4i32_Op1_unknown_Op2_const_pow2(<4 x i32> %a)
   %r3 = insertelement <4 x i32> %r2, i32 %4, i32 3
   ret <4 x i32> %r3
 }
+
+; computes (a/const + x - y) * z
+define <2 x i32> @vectorize_sdiv_v2i32(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
+; NO-SVE-LABEL: define <2 x i32> @vectorize_sdiv_v2i32(
+; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
+; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
+; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], 2
+; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], 4
+; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
+; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
+; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
+; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
+; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
+; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
+; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
+; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
+; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
+; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
+; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
+; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
+; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
+; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
+;
+; SVE-LABEL: define <2 x i32> @vectorize_sdiv_v2i32(
+; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SVE-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
+; SVE-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
+; SVE-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
+; SVE-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
+; SVE-NEXT:    ret <2 x i32> [[TMP4]]
+;
+{
+  %a0 = extractelement <2 x i32> %a, i64 0
+  %a1 = extractelement <2 x i32> %a, i64 1
+  %1 = sdiv i32 %a0, 2
+  %2 = sdiv i32 %a1, 4
+  %x0 = extractelement <2 x i32> %x, i64 0
+  %x1 = extractelement <2 x i32> %x, i64 1
+  %3 = add i32 %1, %x0
+  %4 = add i32 %2, %x1
+  %y0 = extractelement <2 x i32> %y, i64 0
+  %y1 = extractelement <2 x i32> %y, i64 1
+  %5 = sub i32 %3, %y0
+  %6 = sub i32 %4, %y1
+  %z0 = extractelement <2 x i32> %z, i64 0
+  %z1 = extractelement <2 x i32> %z, i64 1
+  %7 = mul i32 %5, %z0
+  %8 = mul i32 %6, %z1
+  %res0 = insertelement <2 x i32> poison, i32 %7, i32 0
+  %res1 = insertelement <2 x i32> %res0, i32 %8, i32 1
+  ret <2 x i32> %res1
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
new file mode 100644
index 0000000000000..0cc4d3db5c537
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> [[TMP7]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    br label %[[BB1]]
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ 0, %bb ], [ %or21, %bb1 ]
+  %sitofp = sitofp i32 0 to float
+  %fadd = fadd float %sitofp, %sitofp
+  %fadd2 = fadd float %fadd, 0.000000e+00
+  %fcmp = fcmp ogt float %fadd2, 0.000000e+00
+  %select = select i1 %fcmp, i32 0, i32 0
+  %select3 = select i1 false, i32 %select, i32 0
+  %fadd4 = fadd float %sitofp, 0.000000e+00
+  %fadd5 = fadd float %fadd4, 0.000000e+00
+  %fcmp6 = fcmp ogt float %fadd5, 0.000000e+00
+  %select7 = select i1 %fcmp6, i32 0, i32 0
+  %select8 = select i1 false, i32 %select7, i32 0
+  %or = or i32 %select3, %select8
+  %sitofp9 = sitofp i32 0 to float
+  %fmul = fmul float 0.000000e+00, 0.000000e+00
+  %fadd10 = fadd float %sitofp9, %fmul
+  %fadd11 = fadd float %fadd10, 0.000000e+00
+  %fcmp12 = fcmp ogt float %fadd11, 0.000000e+00
+  %select13 = select i1 %fcmp12, i32 0, i32 0
+  %select14 = select i1 false, i32 %select13, i32 0
+  %or15 = or i32 %select14, %or
+  %fadd16 = fadd float %fmul, 0.000000e+00
+  %fadd17 = fadd float %fadd16, 0.000000e+00
+  %fcmp18 = fcmp ogt float %fadd17, 0.000000e+00
+  %select19 = select i1 %fcmp18, i32 0, i32 0
+  %select20 = select i1 false, i32 %select19, i32 0
+  %or21 = or i32 %or15, %select20
+  br label %bb1
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll
new file mode 100644
index 0000000000000..79aba19ab02e1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-loads-non-full-reg.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mcpu=cascadelake < %s | FileCheck %s
+
+@solid_ = external global [608 x i8]
+
+define void @test(ptr noalias %0) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr noalias [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[_LR_PH1019:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 128
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 200
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 208
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 232
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 288
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 320
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 304
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 424
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 480
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP0]], i64 504
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 632
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP0]], i64 720
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd double [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load double, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd double [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = load double, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = load double, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP27:%.*]] = load double, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP28:%.*]] = load double, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP29:%.*]] = fadd double [[TMP28]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fmul double [[TMP22]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul double [[TMP30]], 0.000000e+00
+; CHECK-NEXT:    [[TMP32:%.*]] = fsub double 0.000000e+00, [[TMP25]]
+; CHECK-NEXT:    [[TMP33:%.*]] = fmul double [[TMP32]], 0.000000e+00
+; CHECK-NEXT:    [[TMP34:%.*]] = fadd double [[TMP33]], 0.000000e+00
+; CHECK-NEXT:    [[TMP35:%.*]] = fmul double [[TMP34]], 0.000000e+00
+; CHECK-NEXT:    [[TMP36:%.*]] = fmul double [[TMP29]], [[TMP26]]
+; CHECK-NEXT:    [[TMP37:%.*]] = fmul double [[TMP36]], 0.000000e+00
+; CHECK-NEXT:    [[TMP38:%.*]] = fadd double [[TMP37]], 0.000000e+00
+; CHECK-NEXT:    [[TMP39:%.*]] = fsub double [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    [[TMP40:%.*]] = fmul double [[TMP39]], [[TMP23]]
+; CHECK-NEXT:    [[TMP41:%.*]] = fmul double [[TMP40]], 0.000000e+00
+; CHECK-NEXT:    [[TMP42:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP43:%.*]] = load double, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP44:%.*]] = fmul double [[TMP43]], [[TMP31]]
+; CHECK-NEXT:    [[TMP45:%.*]] = load double, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP46:%.*]] = fmul double [[TMP35]], 0.000000e+00
+; CHECK-NEXT:    [[TMP47:%.*]] = fadd double [[TMP44]], 0.000000e+00
+; CHECK-NEXT:    [[TMP48:%.*]] = fmul double [[TMP45]], [[TMP38]]
+; CHECK-NEXT:    [[TMP49:%.*]] = fmul double [[TMP45]], [[TMP41]]
+; CHECK-NEXT:    store double [[TMP46]], ptr getelementptr inbounds (i8, ptr @solid_, i64 384), align 8
+; CHECK-NEXT:    store double [[TMP47]], ptr getelementptr inbounds (i8, ptr @solid_, i64 408), align 8
+; CHECK-NEXT:    store double [[TMP48]], ptr getelementptr inbounds (i8, ptr @solid_, i64 392), align 8
+; CHECK-NEXT:    store double [[TMP49]], ptr getelementptr inbounds (i8, ptr @solid_, i64 400), align 8
+; CHECK-NEXT:    [[DOTNEG965:%.*]] = fmul double [[TMP48]], [[TMP24]]
+; CHECK-NEXT:    [[REASS_ADD993:%.*]] = fadd double [[DOTNEG965]], 0.000000e+00
+; CHECK-NEXT:    [[TMP50:%.*]] = fadd double [[TMP42]], [[REASS_ADD993]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fsub double 0.000000e+00, [[TMP50]]
+; CHECK-NEXT:    store double [[TMP51]], ptr getelementptr inbounds (i8, ptr @solid_, i64 296), align 8
+; CHECK-NEXT:    [[DOTNEG969:%.*]] = fmul double [[TMP49]], 0.000000e+00
+; CHECK-NEXT:    [[REASS_ADD996:%.*]] = fadd double [[DOTNEG969]], 0.000000e+00
+; CHECK-NEXT:    [[TMP52:%.*]] = fadd double [[TMP45]], [[REASS_ADD996]]
+; CHECK-NEXT:    [[TMP53:%.*]] = fsub double 0.000000e+00, [[TMP52]]
+; CHECK-NEXT:    store double [[TMP53]], ptr getelementptr inbounds (i8, ptr @solid_, i64 304), align 8
+; CHECK-NEXT:    ret void
+;
+.lr.ph1019:
+  %1 = getelementptr i8, ptr %0, i64 8
+  %2 = getelementptr i8, ptr %0, i64 32
+  %3 = getelementptr i8, ptr %0, i64 128
+  %4 = getelementptr i8, ptr %0, i64 200
+  %5 = getelementptr i8, ptr %0, i64 208
+  %6 = getelementptr i8, ptr %0, i64 232
+  %7 = getelementptr i8, ptr %0, i64 288
+  %8 = getelementptr i8, ptr %0, i64 320
+  %9 = getelementptr i8, ptr %0, i64 304
+  %10 = getelementptr i8, ptr %0, i64 424
+  %11 = getelementptr i8, ptr %0, i64 480
+  %12 = getelementptr i8, ptr %0, i64 504
+  %13 = getelementptr i8, ptr %0, i64 632
+  %14 = getelementptr i8, ptr %0, i64 720
+  %15 = load double, ptr %1, align 8
+  %16 = load double, ptr %2, align 8
+  %17 = fadd double %16, %15
+  %18 = load double, ptr %3, align 8
+  %19 = load double, ptr %4, align 8
+  %20 = load double, ptr %5, align 8
+  %21 = load double, ptr %6, align 8
+  %22 = fadd double %21, %20
+  %23 = load double, ptr %7, align 8
+  %24 = load double, ptr %8, align 8
+  %25 = load double, ptr %9, align 8
+  %26 = load double, ptr %10, align 8
+  %27 = load double, ptr %11, align 8
+  %28 = load double, ptr %12, align 8
+  %29 = fadd double %28, %27
+  %30 = fmul double %22, %18
+  %31 = fmul double %30, 0.000000e+00
+  %32 = fsub double 0.000000e+00, %25
+  %33 = fmul double %32, 0.000000e+00
+  %34 = fadd double %33, 0.000000e+00
+  %35 = fmul double %34, 0.000000e+00
+  %36 = fmul double %29, %26
+  %37 = fmul double %36, 0.000000e+00
+  %38 = fadd double %37, 0.000000e+00
+  %39 = fsub double %17, %19
+  %40 = fmul double %39, %23
+  %41 = fmul double %40, 0.000000e+00
+  %42 = load double, ptr %0, align 8
+  %43 = load double, ptr %13, align 8
+  %44 = fmul double %43, %31
+  %45 = load double, ptr %14, align 8
+  %46 = fmul double %35, 0.000000e+00
+  %47 = fadd double %44, 0.000000e+00
+  %48 = fmul double %45, %38
+  %49 = fmul double %45, %41
+  store double %46, ptr getelementptr inbounds (i8, ptr @solid_, i64 384), align 8
+  store double %47, ptr getelementptr inbounds (i8, ptr @solid_, i64 408), align 8
+  store double %48, ptr getelementptr inbounds (i8, ptr @solid_, i64 392), align 8
+  store double %49, ptr getelementptr inbounds (i8, ptr @solid_, i64 400), align 8
+  %.neg965 = fmul double %48, %24
+  %reass.add993 = fadd double %.neg965, 0.000000e+00
+  %50 = fadd double %42, %reass.add993
+  %51 = fsub double 0.000000e+00, %50
+  store double %51, ptr getelementptr inbounds (i8, ptr @solid_, i64 296), align 8
+  %.neg969 = fmul double %49, 0.000000e+00
+  %reass.add996 = fadd double %.neg969, 0.000000e+00
+  %52 = fadd double %45, %reass.add996
+  %53 = fsub double 0.000000e+00, %52
+  store double %53, ptr getelementptr inbounds (i8, ptr @solid_, i64 304), align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index 8091c218addfa..a2673d81068d8 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -105,3 +105,19 @@ entry:
   store <4 x i32> %2, ptr %6, align 4
   ret void
 }
+
+define void @test5(ptr %out) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = shufflevector <8 x i32> zeroinitializer, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = shufflevector <8 x i32> zeroinitializer, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = getelementptr inbounds i32, ptr %out, i64 0
+  %3 = getelementptr inbounds i32, ptr %out, i64 4
+  store <4 x i32> %0, ptr %2, align 4
+  store <4 x i32> %1, ptr %3, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
index 28d6b59e2b7f0..058aaea46fd09 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
@@ -973,23 +973,14 @@ define <vscale x 1 x i64> @xor_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
 }
 
 define <vscale x 1 x i64> @smin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @smin_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.smin.i64(i64 [[Y:%.*]], i64 42)
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @smin_nxv1i64_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
+; ALL-LABEL: @smin_nxv1i64_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1016,23 +1007,14 @@ define <vscale x 1 x i64> @smin_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
 }
 
 define <vscale x 1 x i64> @smax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @smax_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 42)
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @smax_nxv1i64_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
+; ALL-LABEL: @smax_nxv1i64_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1059,23 +1041,14 @@ define <vscale x 1 x i64> @smax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
 }
 
 define <vscale x 1 x i64> @umin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @umin_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umin.i64(i64 [[Y:%.*]], i64 42)
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @umin_nxv1i64_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
+; ALL-LABEL: @umin_nxv1i64_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1087,23 +1060,14 @@ define <vscale x 1 x i64> @umin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
 }
 
 define <vscale x 1 x i64> @umax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @umax_nxv1i64_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[Y:%.*]], i64 42)
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @umax_nxv1i64_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
+; ALL-LABEL: @umax_nxv1i64_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x i64> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
new file mode 100644
index 0000000000000..e601de8d706b4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
@@ -0,0 +1,25 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+# CHECK: Iterations:        1
+# CHECK: Instructions:      7
+# CHECK: Total Cycles:      42
+# CHECK: Total uOps:        7
+
+v_mfma_ld_scale_b32 v0, v0
+
+v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
+v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]
+v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
+v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15]
+v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
+
+
+# CHECK:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: -      -      -      -     1.00    -      -    v_mfma_ld_scale_b32 v0, v0
+# CHECK-NEXT: -      -      -      -      -      -     4.00  v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+# CHECK-NEXT: -      -      -      -      -      -     4.00  v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
+# CHECK-NEXT: -      -      -      -      -      -     8.00  v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]
+# CHECK-NEXT: -      -      -      -      -      -     8.00  v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
+# CHECK-NEXT: -      -      -      -      -      -     8.00  v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15]
+# CHECK-NEXT: -      -      -      -      -      -     8.00  v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2
diff --git a/llvm/test/tools/llvm-mca/ARM/m4-ldr-str-w.s b/llvm/test/tools/llvm-mca/ARM/m4-ldr-str-w.s
new file mode 100644
index 0000000000000..cb4eeaff4b2a8
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/ARM/m4-ldr-str-w.s
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s
+str.w r1, [r0], #16
+str.w r1, [r0, 16]!
+strb.w r1, [r0], #16
+strb.w r1, [r0, 16]!
+strh.w r1, [r0], #16
+strh.w r1, [r0, 16]!
+ldr.w r1, [r0], #16
+ldr.w r1, [r0, 16]!
+ldrb.w r1, [r0], #16
+ldrb.w r1, [r0, 16]!
+ldrh.w r1, [r0], #16
+ldrh.w r1, [r0, 16]!
+ldrsb.w r1, [r0], #16
+ldrsb.w r1, [r0, 16]!
+ldrsh.w r1, [r0], #16
+ldrsh.w r1, [r0, 16]!
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1600
+# CHECK-NEXT: Total Cycles:      2601
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    1
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00           *            str	r1, [r0], #16
+# CHECK-NEXT:  1      1     1.00           *            str	r1, [r0, #16]!
+# CHECK-NEXT:  1      1     1.00           *            strb	r1, [r0], #16
+# CHECK-NEXT:  1      1     1.00           *            strb	r1, [r0, #16]!
+# CHECK-NEXT:  1      1     1.00           *            strh	r1, [r0], #16
+# CHECK-NEXT:  1      1     1.00           *            strh	r1, [r0, #16]!
+# CHECK-NEXT:  1      2     1.00    *                   ldr	r1, [r0], #16
+# CHECK-NEXT:  1      2     1.00    *                   ldr	r1, [r0, #16]!
+# CHECK-NEXT:  1      2     1.00    *                   ldrb	r1, [r0], #16
+# CHECK-NEXT:  1      2     1.00    *                   ldrb	r1, [r0, #16]!
+# CHECK-NEXT:  1      2     1.00    *                   ldrh	r1, [r0], #16
+# CHECK-NEXT:  1      2     1.00    *                   ldrh	r1, [r0, #16]!
+# CHECK-NEXT:  1      2     1.00    *                   ldrsb	r1, [r0], #16
+# CHECK-NEXT:  1      2     1.00    *                   ldrsb	r1, [r0, #16]!
+# CHECK-NEXT:  1      2     1.00    *                   ldrsh	r1, [r0], #16
+# CHECK-NEXT:  1      2     1.00    *                   ldrsh	r1, [r0, #16]!
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - M4Unit
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]
+# CHECK-NEXT: 16.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    Instructions:
+# CHECK-NEXT: 1.00   str	r1, [r0], #16
+# CHECK-NEXT: 1.00   str	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   strb	r1, [r0], #16
+# CHECK-NEXT: 1.00   strb	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   strh	r1, [r0], #16
+# CHECK-NEXT: 1.00   strh	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   ldr	r1, [r0], #16
+# CHECK-NEXT: 1.00   ldr	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   ldrb	r1, [r0], #16
+# CHECK-NEXT: 1.00   ldrb	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   ldrh	r1, [r0], #16
+# CHECK-NEXT: 1.00   ldrh	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   ldrsb	r1, [r0], #16
+# CHECK-NEXT: 1.00   ldrsb	r1, [r0, #16]!
+# CHECK-NEXT: 1.00   ldrsh	r1, [r0], #16
+# CHECK-NEXT: 1.00   ldrsh	r1, [r0, #16]!
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s
index c3453d890d76d..8736c1c6234af 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s
@@ -467,6 +467,8 @@ vmovdqu64         %zmm16, (%rax) {k1}
 vmovdqu64         %zmm16, %zmm19 {z}{k1}
 vmovdqu64         (%rax), %zmm19 {z}{k1}
 
+vmovntdqa         (%rax), %zmm0
+
 vmovshdup         %zmm16, %zmm19
 vmovshdup         (%rax), %zmm19
 vmovshdup         %zmm16, %zmm19 {k1}
@@ -1497,6 +1499,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      1     1.00           *            vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  1      1     0.50                        vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      7     0.50    *                   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  2      8     1.00    *                   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19 {%k1}
@@ -2050,7 +2053,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     1506.00 198.00 335.00 25.00 523.00 304.00 304.00
+# CHECK-NEXT:  -     1506.00 198.00 335.00 25.00 523.00 304.50 304.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -2463,6 +2466,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -     1.00    -     0.50   0.50   vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  -      -     0.50    -      -     0.50    -      -     vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%zmm16, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s
index 4a4f77826437b..8bf3c21891f7f 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s
@@ -750,6 +750,9 @@ vmovddup          (%rax), %ymm19 {k1}
 vmovddup          %ymm16, %ymm19 {z}{k1}
 vmovddup          (%rax), %ymm19 {z}{k1}
 
+{evex} vmovntdqa  (%rax), %xmm0
+{evex} vmovntdqa  (%rax), %ymm0
+
 vmovshdup         %xmm16, %xmm19
 vmovshdup         (%rax), %xmm19
 vmovshdup         %xmm16, %xmm19 {k1}
@@ -2397,6 +2400,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  2      8     1.00    *                   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  2      8     1.00    *                   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      6     0.50    *                   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  1      7     0.50    *                   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  2      7     1.00    *                   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm16, %xmm19 {%k1}
@@ -3264,7 +3269,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]
-# CHECK-NEXT:  -     1935.00 278.00 579.50 48.00 738.50 494.50 494.50
+# CHECK-NEXT:  -     1935.00 278.00 579.50 48.00 738.50 495.50 495.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6.0]  [6.1]  Instructions:
@@ -3936,6 +3941,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -     0.50   0.50   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  -      -      -      -      -     1.00   0.50   0.50   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     vmovshdup	%xmm16, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
index edd3e7cd8c0e7..0e7c5751e5c08 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
@@ -1300,8 +1300,8 @@ vzeroupper
 # CHECK-NEXT:  1      2     1.00                        vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  2      1     0.50           *            vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     0.50           *            vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  2      1     0.50           *            vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     0.50           *            vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  2      1     0.50           *            vmovntps	%xmm0, (%rax)
@@ -1738,7 +1738,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -     126.00 325.58 256.58 160.50 160.50 19.00  270.58 6.25   19.00  19.00  19.00
+# CHECK-NEXT:  -     126.00 326.25 257.25 160.50 160.50 19.00  271.25 6.25   19.00  19.00  19.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -2012,8 +2012,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -      -      -     vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovntps	%xmm0, (%rax)
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
index 81b178d12d2d3..c7a0be0cf9cde 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
@@ -475,7 +475,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  5      20    2.00    *                   vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  2      4     1.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  3      11    1.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsb	%ymm0, %ymm2
@@ -778,7 +778,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     110.33 116.33 98.00  98.00  2.50   137.33  -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     110.67 116.67 98.00  98.00  2.50   137.67  -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -797,7 +797,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     1.33   0.33   2.00   2.00    -     1.33    -      -      -      -     vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vpabsb	%ymm0, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
index c509e766540b1..8b495d6ee268e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s
@@ -467,6 +467,8 @@ vmovdqu64         %zmm16, (%rax) {k1}
 vmovdqu64         %zmm16, %zmm19 {z}{k1}
 vmovdqu64         (%rax), %zmm19 {z}{k1}
 
+vmovntdqa         (%rax), %zmm0
+
 vmovshdup         %zmm16, %zmm19
 vmovshdup         (%rax), %zmm19
 vmovshdup         %zmm16, %zmm19 {k1}
@@ -1497,6 +1499,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  2      1     0.50           *            vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  1      1     0.50                        vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  2      8     0.50    *                   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  2      8     0.50    *                   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19 {%k1}
@@ -2054,7 +2057,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -     612.00 411.17 103.67 327.50 327.50 48.50  593.17 6.00   48.50  48.50  48.50
+# CHECK-NEXT:  -     612.00 411.50 104.00 328.00 328.00 48.50  593.50 6.00   48.50  48.50  48.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -2467,6 +2470,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -      -      -     vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vmovshdup	%zmm16, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s
index 00e5c3b03f6f5..a57ed444f794e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s
@@ -750,6 +750,9 @@ vmovddup          (%rax), %ymm19 {k1}
 vmovddup          %ymm16, %ymm19 {z}{k1}
 vmovddup          (%rax), %ymm19 {z}{k1}
 
+{evex} vmovntdqa  (%rax), %xmm0
+{evex} vmovntdqa  (%rax), %ymm0
+
 vmovshdup         %xmm16, %xmm19
 vmovshdup         (%rax), %xmm19
 vmovshdup         %xmm16, %xmm19 {k1}
@@ -2397,6 +2400,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  2      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  2      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  2      7     0.50    *                   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  2      8     0.50    *                   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  2      7     0.50    *                   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19 {%k1}
@@ -3268,7 +3273,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -     423.00 462.33 421.33 492.50 492.50 44.00  738.33 12.00  44.00  44.00  44.00
+# CHECK-NEXT:  -     423.00 463.00 422.00 493.50 493.50 44.00  739.00 12.00  44.00  44.00  44.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -3940,6 +3945,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     0.50    -      -      -      -     vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     0.50    -      -      -      -     vmovshdup	%xmm16, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
index 6ff620b0779f3..9748a27848770 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
@@ -171,7 +171,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  3      2     1.00           *            extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  2      4     1.00                        mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  3      10    1.00    *                   mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        packusdw	%xmm0, %xmm2
@@ -268,7 +268,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     36.67  46.17  22.00  22.00  2.50   49.17   -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     37.00  46.50  22.00  22.00  2.50   49.50   -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -288,7 +288,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -     0.50   1.00    -     0.50   0.50   0.50   extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     packusdw	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s
index d034cbd0f6398..82132309c45d4 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s
@@ -154,8 +154,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  1      5     0.50                        pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  2      11    0.50    *                   pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        pshufb	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  2      1     1.00                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  3      6     1.00    *                   pshufb	(%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     0.50    *                   pshufb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psignb	%mm0, %mm2
@@ -187,7 +187,7 @@ psignw      (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     30.67  25.67  16.00  16.00   -     55.67   -      -      -      -
+# CHECK-NEXT:  -      -     32.67  25.67  16.00  16.00   -     55.67   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -239,8 +239,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -      -      -     pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -      -      -     pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     pshufb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00    -      -      -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -     1.00    -      -      -      -     pshufb	(%rax), %mm2
 # CHECK-NEXT:  -      -      -     0.50    -      -      -     0.50    -      -      -      -     pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     0.50    -      -      -      -     pshufb	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -      -      -     psignb	%mm0, %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s
index b2fde3929106a..5ad7397a8ddc3 100644
--- a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s
@@ -467,6 +467,8 @@ vmovdqu64         %zmm16, (%rax) {k1}
 vmovdqu64         %zmm16, %zmm19 {z}{k1}
 vmovdqu64         (%rax), %zmm19 {z}{k1}
 
+vmovntdqa         (%rax), %zmm0
+
 vmovshdup         %zmm16, %zmm19
 vmovshdup         (%rax), %zmm19
 vmovshdup         %zmm16, %zmm19 {k1}
@@ -1497,6 +1499,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  2      12    0.50           *            vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  1      1     0.50                        vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  2      9     0.50    *                   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      8     0.33    *                   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  1      8     0.33    *                   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19 {%k1}
@@ -2055,7 +2058,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 508.60 13.60  218.33 218.33 48.50  578.60 1.60   48.50  48.50  48.50  1.60   218.33  -
+# CHECK-NEXT: 508.60 13.60  218.67 218.67 48.50  578.60 1.60   48.50  48.50  48.50  1.60   218.67  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -2468,6 +2471,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -     0.50    -      -     0.50   0.50   0.50    -      -      -     vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT: 0.50    -      -      -      -     0.50    -      -      -      -      -      -      -     vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT: 0.50    -     0.33   0.33    -     0.50    -      -      -      -      -     0.33    -     vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33    -      -      -      -      -      -      -     0.33    -     vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -      -      -      -      -     0.33    -     vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmovshdup	%zmm16, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s
index d8c76832d38d3..c1d6d5776e587 100644
--- a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s
@@ -750,6 +750,9 @@ vmovddup          (%rax), %ymm19 {k1}
 vmovddup          %ymm16, %ymm19 {z}{k1}
 vmovddup          (%rax), %ymm19 {z}{k1}
 
+{evex} vmovntdqa  (%rax), %xmm0
+{evex} vmovntdqa  (%rax), %ymm0
+
 vmovshdup         %xmm16, %xmm19
 vmovshdup         (%rax), %xmm19
 vmovshdup         %xmm16, %xmm19 {k1}
@@ -2397,6 +2400,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  2      9     0.33    *                   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  2      9     0.33    *                   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      7     0.33    *                   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  1      8     0.33    *                   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  1      7     0.33    *                   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19 {%k1}
@@ -3269,7 +3274,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 404.53 412.53 328.33 328.33 46.00  797.53 3.20   46.00  46.00  46.00  3.20   328.33  -
+# CHECK-NEXT: 404.53 412.53 329.00 329.00 46.00  797.53 3.20   46.00  46.00  46.00  3.20   329.00  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -3941,6 +3946,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT: 0.33   0.33   0.33   0.33    -     0.33    -      -      -      -      -     0.33    -     vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT: 0.33   0.33   0.33   0.33    -     0.33    -      -      -      -      -     0.33    -     vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33    -      -      -      -      -      -      -     0.33    -     {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  -      -     0.33   0.33    -      -      -      -      -      -      -     0.33    -     {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -     0.50    -      -      -     0.50    -      -      -      -      -      -      -     vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -      -      -      -      -     0.33    -     vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  -     0.50    -      -      -     0.50    -      -      -      -      -      -      -     vmovshdup	%xmm16, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index e05911e4709dc..1c08bb82d4006 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1300,8 +1300,8 @@ vzeroupper
 # CHECK-NEXT:  1      2     1.00                        vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  2      1     1.00           *            vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  2      1     1.00           *            vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntps	%xmm0, (%rax)
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     126.00 338.58 200.58 173.83 173.83 38.00  324.58 6.25   11.33
+# CHECK-NEXT:  -     126.00 339.25 201.25 173.83 173.83 38.00  325.25 6.25   11.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2010,8 +2010,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntps	%xmm0, (%rax)
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx2.s
index d5f347b15548d..e07b60a985350 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx2.s
@@ -475,7 +475,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  5      20    2.00    *                   vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  3      11    2.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsb	%ymm0, %ymm2
@@ -776,7 +776,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     110.33 89.33  100.33 100.33 5.00   164.33  -     0.33
+# CHECK-NEXT:  -      -     110.67 89.67  100.33 100.33 5.00   164.67  -     0.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -795,7 +795,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     1.33   0.33   2.00   2.00    -     1.33    -      -     vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     vpabsb	%ymm0, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 0eec7e4cf58e8..307c858051e4a 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -171,7 +171,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  3      2     1.00           *            extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  2      4     2.00                        mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  3      10    2.00    *                   mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        packusdw	%xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     38.83  30.33  23.67  23.67  5.00   63.33  0.50   1.67
+# CHECK-NEXT:  -      -     39.17  30.67  23.67  23.67  5.00   63.67  0.50   1.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -286,7 +286,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     packusdw	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-ssse3.s
index 047974f8a6c71..6bcac9b86dd3e 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-ssse3.s
@@ -154,8 +154,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  1      5     0.50                        pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  2      11    0.50    *                   pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        pshufb	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  2      1     1.00                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  3      6     1.00    *                   pshufb	(%rax), %mm2
 # CHECK-NEXT:  1      1     1.00                        pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   pshufb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psignb	%mm0, %mm2
@@ -185,7 +185,7 @@ psignw      (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     30.67  12.67  16.00  16.00   -     68.67   -      -
+# CHECK-NEXT:  -      -     32.67  12.67  16.00  16.00   -     68.67   -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -237,8 +237,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     pshufb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00    -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -     psignb	%mm0, %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
index 1f352c119f523..6079e177ce61d 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
@@ -1300,8 +1300,8 @@ vzeroupper
 # CHECK-NEXT:  1      2     1.00                        vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  2      1     1.00           *            vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  1      6     0.50    *                   vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  2      1     1.00           *            vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  2      1     1.00           *            vmovntps	%xmm0, (%rax)
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     126.00 327.92 204.92 173.83 173.83 38.00  320.92 7.25   11.33
+# CHECK-NEXT:  -     126.00 328.58 205.58 173.83 173.83 38.00  321.58 7.25   11.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2010,8 +2010,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     vmovmskps	%ymm0, %ecx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntdq	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntdq	%ymm0, (%rax)
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %ymm2
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntpd	%xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntpd	%ymm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovntps	%xmm0, (%rax)
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx2.s
index 0df51fab0bb0b..6e75196b34584 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx2.s
@@ -475,7 +475,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  5      20    2.00    *                   vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  3      11    2.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsb	%ymm0, %ymm2
@@ -776,7 +776,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     110.33 89.33  100.33 100.33 5.00   164.33  -     0.33
+# CHECK-NEXT:  -      -     110.67 89.67  100.33 100.33 5.00   164.67  -     0.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -795,7 +795,7 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     1.33   0.33   2.00   2.00    -     1.33    -      -     vgatherqps	%xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vinserti128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm0
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     vpabsb	%ymm0, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s
index 9c006d4ebb077..4f384dcf35c83 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s
@@ -467,6 +467,8 @@ vmovdqu64         %zmm16, (%rax) {k1}
 vmovdqu64         %zmm16, %zmm19 {z}{k1}
 vmovdqu64         (%rax), %zmm19 {z}{k1}
 
+vmovntdqa         (%rax), %zmm0
+
 vmovshdup         %zmm16, %zmm19
 vmovshdup         (%rax), %zmm19
 vmovshdup         %zmm16, %zmm19 {k1}
@@ -1497,6 +1499,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  2      1     1.00           *            vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  1      1     0.50                        vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  2      8     0.50    *                   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  2      8     0.50    *                   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  2      8     0.50    *                   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19 {%k1}
@@ -2052,7 +2055,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     612.00 352.67 103.67 359.83 359.83 97.00  651.67 6.00   32.33
+# CHECK-NEXT:  -     612.00 353.00 104.00 360.33 360.33 97.00  652.00 6.00   32.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2465,6 +2468,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00    -      -     0.33   vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -     vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vmovshdup	%zmm16, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s
index b4b18101a67b8..de4d158e3b60c 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s
@@ -750,6 +750,9 @@ vmovddup          (%rax), %ymm19 {k1}
 vmovddup          %ymm16, %ymm19 {z}{k1}
 vmovddup          (%rax), %ymm19 {z}{k1}
 
+{evex} vmovntdqa  (%rax), %xmm0
+{evex} vmovntdqa  (%rax), %ymm0
+
 vmovshdup         %xmm16, %xmm19
 vmovshdup         (%rax), %xmm19
 vmovshdup         %xmm16, %xmm19 {k1}
@@ -2397,6 +2400,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  2      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  1      1     1.00                        vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  2      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  2      7     0.50    *                   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  2      8     0.50    *                   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  2      7     0.50    *                   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%xmm16, %xmm19 {%k1}
@@ -3266,7 +3271,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     423.00 462.33 358.33 521.83 521.83 88.00  801.33 12.00  29.33
+# CHECK-NEXT:  -     423.00 463.00 359.00 522.83 522.83 88.00  802.00 12.00  29.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -3938,6 +3943,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     vmovshdup	%xmm16, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
index e3f34fdc9430d..c9d3a8e40b652 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
@@ -171,7 +171,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  3      2     1.00           *            extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      7     0.50    *                   movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  2      4     2.00                        mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  3      10    2.00    *                   mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        packusdw	%xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     38.17  29.67  23.67  23.67  5.00   64.67  0.50   1.67
+# CHECK-NEXT:  -      -     38.50  30.00  23.67  23.67  5.00   65.00  0.50   1.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -286,7 +286,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   extractps	$1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     insertps	$1, (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -     movntdqa	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     mpsadbw	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -     mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     packusdw	%xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-ssse3.s
index c8b5cbb952451..193d38594bb29 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-ssse3.s
@@ -154,8 +154,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  1      5     0.50                        pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  2      11    0.50    *                   pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        pshufb	%mm0, %mm2
-# CHECK-NEXT:  2      6     1.00    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  2      1     1.00                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  3      6     1.00    *                   pshufb	(%rax), %mm2
 # CHECK-NEXT:  1      1     1.00                        pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   pshufb	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        psignb	%mm0, %mm2
@@ -185,7 +185,7 @@ psignw      (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     30.67  12.67  16.00  16.00   -     68.67   -      -
+# CHECK-NEXT:  -      -     32.67  12.67  16.00  16.00   -     68.67   -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -237,8 +237,8 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     pmulhrsw	(%rax), %mm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     pmulhrsw	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     pshufb	%mm0, %mm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00    -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -     pshufb	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pshufb	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -     psignb	%mm0, %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
index 0c4c656768001..c8f07e94ad6b7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
@@ -467,6 +467,8 @@ vmovdqu64         %zmm16, (%rax) {k1}
 vmovdqu64         %zmm16, %zmm19 {z}{k1}
 vmovdqu64         (%rax), %zmm19 {z}{k1}
 
+vmovntdqa         (%rax), %zmm0
+
 vmovshdup         %zmm16, %zmm19
 vmovshdup         (%rax), %zmm19
 vmovshdup         %zmm16, %zmm19 {k1}
@@ -1497,6 +1499,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      1     1.00           *            vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  1      0     0.17                        vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      8     0.50    *                   vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  1      8     1.00    *                   vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  1      1     1.00                        vmovshdup	%zmm16, %zmm19 {%k1}
@@ -2065,7 +2068,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 5.33   5.33   5.33    -      -      -      -      -     219.50 1059.00 616.50 351.00 297.00 297.00 17.00 205.33 205.33 205.33 194.33 194.33 194.33 16.50  16.50
+# CHECK-NEXT: 5.33   5.33   5.33    -      -      -      -      -     219.50 1059.00 616.50 351.00 297.50 297.50 17.00 205.67 205.67 205.67 194.67 194.67 194.67 16.50  16.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2478,6 +2481,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   1.00   0.33   0.33   0.33    -      -      -     0.50   0.50   vmovdqu64	%zmm16, (%rax) {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovdqu64	%zmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovdqu64	(%rax), %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovntdqa	(%rax), %zmm0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vmovshdup	%zmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovshdup	(%rax), %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vmovshdup	%zmm16, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
index 4636e23d9df3e..5565eb740a1c6 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
@@ -750,6 +750,9 @@ vmovddup          (%rax), %ymm19 {k1}
 vmovddup          %ymm16, %ymm19 {z}{k1}
 vmovddup          (%rax), %ymm19 {z}{k1}
 
+{evex} vmovntdqa  (%rax), %xmm0
+{evex} vmovntdqa  (%rax), %ymm0
+
 vmovshdup         %xmm16, %xmm19
 vmovshdup         (%rax), %xmm19
 vmovshdup         %xmm16, %xmm19 {k1}
@@ -2397,6 +2400,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  1      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  1      2     1.00                        vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      8     0.50    *                   {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  1      8     0.50    *                   {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  1      8     0.50    *                   vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  1      1     0.50                        vmovshdup	%xmm16, %xmm19 {%k1}
@@ -3279,7 +3284,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 10.67  10.67  10.67   -      -      -      -      -     208.00 948.00 501.50 261.50 478.50 478.50 32.00  335.00 335.00 335.00 313.67 313.67 313.67 32.00  32.00
+# CHECK-NEXT: 10.67  10.67  10.67   -      -      -      -      -     208.00 948.00 501.50 261.50 479.50 479.50 32.00  335.67 335.67 335.67 314.33 314.33 314.33 32.00  32.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -3951,6 +3956,8 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovddup	(%rax), %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vmovddup	%ymm16, %ymm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovddup	(%rax), %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     {evex}	vmovntdqa	(%rax), %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     {evex}	vmovntdqa	(%rax), %ymm0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     vmovshdup	%xmm16, %xmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vmovshdup	(%rax), %xmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     vmovshdup	%xmm16, %xmm19 {%k1}
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
new file mode 100644
index 0000000000000..5b9d42c7fad55
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
@@ -0,0 +1,52 @@
+;; Test disassembly for gfx950 kernel descriptor.
+
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o
+; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o
+; FIxMe: cmp 1.o 1-disasm.o
+; CHECK: .amdhsa_kernel kernel
+; CHECK-NEXT:	.amdhsa_group_segment_fixed_size 163840
+; CHECK-NEXT:	.amdhsa_private_segment_fixed_size 0
+; CHECK-NEXT:	.amdhsa_kernarg_size 0
+; CHECK-NEXT:	.amdhsa_accum_offset 4
+; CHECK-NEXT:	.amdhsa_tg_split 0
+; CHECK-NEXT:	.amdhsa_next_free_vgpr 8
+; CHECK-NEXT:	.amdhsa_reserve_vcc 0
+; CHECK-NEXT:	.amdhsa_reserve_xnack_mask 0
+; CHECK-NEXT:	.amdhsa_next_free_sgpr 8
+; CHECK-NEXT:	.amdhsa_float_round_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_round_mode_16_64 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_16_64 3
+; CHECK-NEXT:	.amdhsa_dx10_clamp 1
+; CHECK-NEXT:	.amdhsa_ieee_mode 1
+; CHECK-NEXT:	.amdhsa_fp16_overflow 0
+; CHECK-NEXT:	.amdhsa_enable_private_segment 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_x 1
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_y 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_z 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_info 0
+; CHECK-NEXT:	.amdhsa_system_vgpr_workitem_id 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_invalid_op 0
+; CHECK-NEXT:	.amdhsa_exception_fp_denorm_src 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_div_zero 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_overflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_underflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_inexact 0
+; CHECK-NEXT:	.amdhsa_exception_int_div_zero 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_queue_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_kernarg_segment_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_id 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_private_segment_size 0
+; CHECK-NEXT:	.amdhsa_uses_dynamic_stack 0
+; CHECK-NEXT:.end_amdhsa_kernel
+.amdhsa_kernel kernel
+  .amdhsa_group_segment_fixed_size 163840
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
index 45071ecb75132..8d5307372a303 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @test_kernel() {
 
 ; ----------------------------------GFX9---------------------------------------
 ;
-
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s
 ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt
 ; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
@@ -148,6 +147,11 @@ define amdgpu_kernel void @test_kernel() {
 ; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
 ; RUN: diff %t-specify.txt %t-detect.txt
 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx950 %t.o > %t-specify.txt
+; RUN: llvm-objdump -D %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj -O0 -o %t.o %s
 ; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx942 %t.o > %t-specify.txt
 ; RUN: llvm-objdump -D %t.o > %t-detect.txt
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
index 34c22dca3aa18..7de64a6edfe2e 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -223,6 +223,15 @@
 # RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 -DFLAG_VALUE=0x4C
 
+# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
 # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 -DFLAG_VALUE=0x33
 
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 0a70321fab781..3c3bff76fb681 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -537,6 +537,8 @@ struct ConstantInliner {
   std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                  unsigned Value);
 
+  std::vector<MCInst> loadDirectionFlagAndFinalize();
+
 private:
   ConstantInliner &add(const MCInst &Inst) {
     Instructions.push_back(Inst);
@@ -612,6 +614,15 @@ ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
   return std::move(Instructions);
 }
 
+std::vector<MCInst> ConstantInliner::loadDirectionFlagAndFinalize() {
+  if (Constant_.isZero())
+    add(MCInstBuilder(X86::CLD)); // CLD clears the direction flag (DF = 0).
+  else if (Constant_.isOne())
+    add(MCInstBuilder(X86::STD)); // STD sets the direction flag (DF = 1).
+  // Any other constant appends nothing; the list is moved out to the caller.
+  return std::move(Instructions);
+}
+
 void ConstantInliner::initStack(unsigned Bytes) {
   assert(Constant_.getBitWidth() <= Bytes * 8 &&
          "Value does not have the correct size");
@@ -1089,6 +1100,8 @@ std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
         0x1f80);
   if (Reg == X86::FPCW)
     return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
+  if (Reg == X86::DF)
+    return CI.loadDirectionFlagAndFinalize();
   return {}; // Not yet implemented.
 }
 
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 26a888c628d9d..104d802b1e1ee 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -58,6 +58,7 @@ class ObjcopyOptTable : public opt::GenericOptTable {
 public:
   ObjcopyOptTable() : opt::GenericOptTable(objcopy_opt::ObjcopyInfoTable) {
     setGroupedShortOptions(true);
+    setDashDashParsing(true);
   }
 };
 
@@ -650,17 +651,11 @@ parseChangeSectionAddr(StringRef ArgValue, StringRef OptionName,
 // help flag is set then parseObjcopyOptions will print the help messege and
 // exit.
 Expected<DriverConfig>
-objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
+objcopy::parseObjcopyOptions(ArrayRef<const char *> ArgsArr,
                              function_ref<Error(Error)> ErrorCallback) {
   DriverConfig DC;
   ObjcopyOptTable T;
 
-  const char *const *DashDash =
-      llvm::find_if(RawArgsArr, [](StringRef Str) { return Str == "--"; });
-  ArrayRef<const char *> ArgsArr = ArrayRef(RawArgsArr.begin(), DashDash);
-  if (DashDash != RawArgsArr.end())
-    DashDash = std::next(DashDash);
-
   unsigned MissingArgumentIndex, MissingArgumentCount;
   llvm::opt::InputArgList InputArgs =
       T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
@@ -671,7 +666,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
         "argument to '%s' is missing (expected %d value(s))",
         InputArgs.getArgString(MissingArgumentIndex), MissingArgumentCount);
 
-  if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) {
+  if (InputArgs.size() == 0) {
     printHelp(T, errs(), ToolType::Objcopy);
     exit(1);
   }
@@ -695,7 +690,6 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
 
   for (auto *Arg : InputArgs.filtered(OBJCOPY_INPUT))
     Positional.push_back(Arg->getValue());
-  std::copy(DashDash, RawArgsArr.end(), std::back_inserter(Positional));
 
   if (Positional.empty())
     return createStringError(errc::invalid_argument, "no input file specified");
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 8a42e430fb54e..7641a80129de3 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -375,7 +375,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
     cl::desc(
         "Function level overlap information for every function (with calling "
         "context for csspgo) in test "
-        "profile with max count value greater then the parameter value"),
+        "profile with max count value greater than the parameter value"),
     cl::sub(OverlapSubcommand));
 
 // Options specific to show subcommand.
@@ -720,33 +720,7 @@ loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
                               Filename);
     };
 
-    // Add the frame mappings into the writer context.
-    const auto &IdToFrame = Reader->getFrameMapping();
-    for (const auto &I : IdToFrame) {
-      bool Succeeded = WC->Writer.addMemProfFrame(
-          /*Id=*/I.first, /*Frame=*/I.getSecond(), MemProfError);
-      // If we weren't able to add the frame mappings then it doesn't make sense
-      // to try to add the records from this profile.
-      if (!Succeeded)
-        return;
-    }
-
-    // Add the call stacks into the writer context.
-    const auto &CSIdToCallStacks = Reader->getCallStacks();
-    for (const auto &I : CSIdToCallStacks) {
-      bool Succeeded = WC->Writer.addMemProfCallStack(
-          /*Id=*/I.first, /*Frame=*/I.getSecond(), MemProfError);
-      // If we weren't able to add the call stacks then it doesn't make sense
-      // to try to add the records from this profile.
-      if (!Succeeded)
-        return;
-    }
-
-    const auto &FunctionProfileData = Reader->getProfileData();
-    // Add the memprof records into the writer context.
-    for (const auto &[GUID, Record] : FunctionProfileData) {
-      WC->Writer.addMemProfRecord(GUID, Record);
-    }
+    WC->Writer.addMemProfData(Reader->takeMemProfData(), MemProfError);
     return;
   }
 
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 1012cd020d525..bb8ec41d87454 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1619,6 +1619,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"),                            \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"),                            \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"),                            \
+  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"),                            \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012"),                          \
diff --git a/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp b/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
index 6431fda86c9dc..beefff2b3b106 100644
--- a/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
+++ b/llvm/unittests/Analysis/InlineAdvisorPlugin/InlineAdvisorPlugin.cpp
@@ -17,32 +17,16 @@ InlineAdvisor *DefaultAdvisorFactory(Module &M, FunctionAnalysisManager &FAM,
   return new DefaultInlineAdvisor(M, FAM, Params, IC);
 }
 
-struct DefaultDynamicAdvisor : PassInfoMixin<DefaultDynamicAdvisor> {
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
-    PluginInlineAdvisorAnalysis DA(DefaultAdvisorFactory);
-    MAM.registerPass([&] { return DA; });
-    return PreservedAnalyses::all();
-  }
-};
-
 } // namespace
 
 /* New PM Registration */
 llvm::PassPluginLibraryInfo getDefaultDynamicAdvisorPluginInfo() {
   return {LLVM_PLUGIN_API_VERSION, "DynamicDefaultAdvisor", LLVM_VERSION_STRING,
           [](PassBuilder &PB) {
-            PB.registerPipelineStartEPCallback(
-                [](ModulePassManager &MPM, OptimizationLevel Level) {
-                  MPM.addPass(DefaultDynamicAdvisor());
-                });
-            PB.registerPipelineParsingCallback(
-                [](StringRef Name, ModulePassManager &PM,
-                   ArrayRef<PassBuilder::PipelineElement> InnerPipeline) {
-                  if (Name == "dynamic-inline-advisor") {
-                    PM.addPass(DefaultDynamicAdvisor());
-                    return true;
-                  }
-                  return false;
+            PB.registerAnalysisRegistrationCallback(
+                [](ModuleAnalysisManager &MAM) {
+                  PluginInlineAdvisorAnalysis PA(DefaultAdvisorFactory);
+                  MAM.registerPass([&] { return PA; });
                 });
           }};
 }
diff --git a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
index 3330751120e6c..ca4ea8b627e83 100644
--- a/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/PluginInlineAdvisorAnalysisTest.cpp
@@ -66,8 +66,6 @@ struct CompilerInstance {
     Expected<PassPlugin> Plugin = PassPlugin::Load(PluginPath);
     ASSERT_TRUE(!!Plugin) << "Plugin path: " << PluginPath;
     Plugin->registerPassBuilderCallbacks(PB);
-    ASSERT_THAT_ERROR(PB.parsePassPipeline(MPM, "dynamic-inline-advisor"),
-                      Succeeded());
   }
 
   // connect the FooOnlyInlineAdvisor to our compiler instance
@@ -87,33 +85,10 @@ struct CompilerInstance {
                                   ThinOrFullLTOPhase::None));
   }
 
-  ~CompilerInstance() {
-    // Reset the static variable that tracks if the plugin has been registered.
-    // This is needed to allow the test to run multiple times.
-    PluginInlineAdvisorAnalysis::HasBeenRegistered = false;
-  }
-
   std::string output;
   std::unique_ptr<Module> outputM;
 
-  // run with the default inliner
-  auto run_default(StringRef IR) {
-    PluginInlineAdvisorAnalysis::HasBeenRegistered = false;
-    outputM = parseAssemblyString(IR, Error, Ctx);
-    MPM.run(*outputM, MAM);
-    ASSERT_TRUE(outputM);
-    output.clear();
-    raw_string_ostream o_stream{output};
-    outputM->print(o_stream, nullptr);
-    ASSERT_TRUE(true);
-  }
-
-  // run with the dnamic inliner
-  auto run_dynamic(StringRef IR) {
-    // note typically the constructor for the DynamicInlineAdvisorAnalysis
-    // will automatically set this to true, we controll it here only to
-    // altenate between the default and dynamic inliner in our test
-    PluginInlineAdvisorAnalysis::HasBeenRegistered = true;
+  auto run(StringRef IR) {
     outputM = parseAssemblyString(IR, Error, Ctx);
     MPM.run(*outputM, MAM);
     ASSERT_TRUE(outputM);
@@ -274,14 +249,16 @@ TEST(PluginInlineAdvisorTest, PluginLoad) {
   // Skip the test if plugins are disabled.
   GTEST_SKIP();
 #endif
-  CompilerInstance CI{};
-  CI.setupPlugin();
+  CompilerInstance DefaultCI{};
+
+  CompilerInstance PluginCI{};
+  PluginCI.setupPlugin();
 
   for (StringRef IR : TestIRS) {
-    CI.run_default(IR);
-    std::string default_output = CI.output;
-    CI.run_dynamic(IR);
-    std::string dynamic_output = CI.output;
+    DefaultCI.run(IR);
+    std::string default_output = DefaultCI.output;
+    PluginCI.run(IR);
+    std::string dynamic_output = PluginCI.output;
     ASSERT_EQ(default_output, dynamic_output);
   }
 }
@@ -294,7 +271,7 @@ TEST(PluginInlineAdvisorTest, CustomAdvisor) {
   CI.setupFooOnly();
 
   for (StringRef IR : TestIRS) {
-    CI.run_dynamic(IR);
+    CI.run(IR);
     CallGraph CGraph = CallGraph(*CI.outputM);
     for (auto &node : CGraph) {
       for (auto &edge : *node.second) {
diff --git a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
index ca860a0dd5584..0b31b0892d75a 100644
--- a/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/PluginInlineOrderAnalysisTest.cpp
@@ -61,12 +61,6 @@ struct CompilerInstance {
                                   ThinOrFullLTOPhase::None));
   }
 
-  ~CompilerInstance() {
-    // Reset the static variable that tracks if the plugin has been registered.
-    // This is needed to allow the test to run multiple times.
-    PluginInlineOrderAnalysis::unregister();
-  }
-
   std::string Output;
   std::unique_ptr<Module> OutputM;
 
diff --git a/llvm/unittests/FuzzMutate/OperationsTest.cpp b/llvm/unittests/FuzzMutate/OperationsTest.cpp
index bc972ad21d049..b4a7bb70f328f 100644
--- a/llvm/unittests/FuzzMutate/OperationsTest.cpp
+++ b/llvm/unittests/FuzzMutate/OperationsTest.cpp
@@ -261,7 +261,7 @@ TEST(OperationsTest, SplitBlock) {
   // Create a block with only a return and split it on the return.
   auto *BB = BasicBlock::Create(Ctx, "BB", F);
   auto *RI = ReturnInst::Create(Ctx, BB);
-  SBOp.BuilderFunc({UndefValue::get(Type::getInt1Ty(Ctx))}, RI->getIterator());
+  SBOp.BuilderFunc({PoisonValue::get(Type::getInt1Ty(Ctx))}, RI->getIterator());
 
   // We should end up with an unconditional branch from BB to BB1, and the
   // return ends up in BB1.
@@ -368,11 +368,11 @@ TEST(OperationsTest, GEP) {
   auto *RI = ReturnInst::Create(Ctx, BB);
 
   auto GEPOp = fuzzerop::gepDescriptor(1);
-  EXPECT_TRUE(GEPOp.SourcePreds[0].matches({}, UndefValue::get(Int8PtrTy)));
-  EXPECT_TRUE(GEPOp.SourcePreds[1].matches({UndefValue::get(Int8PtrTy)},
+  EXPECT_TRUE(GEPOp.SourcePreds[0].matches({}, PoisonValue::get(Int8PtrTy)));
+  EXPECT_TRUE(GEPOp.SourcePreds[1].matches({PoisonValue::get(Int8PtrTy)},
                                            ConstantInt::get(Int32Ty, 0)));
 
-  GEPOp.BuilderFunc({UndefValue::get(Int8PtrTy), ConstantInt::get(Int32Ty, 0)},
+  GEPOp.BuilderFunc({PoisonValue::get(Int8PtrTy), ConstantInt::get(Int32Ty, 0)},
                     RI->getIterator());
   EXPECT_FALSE(verifyModule(M, &errs()));
 }
@@ -419,11 +419,11 @@ TEST(OperationsTest, ExtractAndInsertValue) {
   auto IVOp = fuzzerop::insertValueDescriptor(1);
 
   // Sanity check the source preds.
-  Constant *SVal = UndefValue::get(StructTy);
-  Constant *OVal = UndefValue::get(OpaqueTy);
-  Constant *AVal = UndefValue::get(ArrayTy);
-  Constant *ZAVal = UndefValue::get(ZeroSizedArrayTy);
-  Constant *VVal = UndefValue::get(VectorTy);
+  Constant *SVal = PoisonValue::get(StructTy);
+  Constant *OVal = PoisonValue::get(OpaqueTy);
+  Constant *AVal = PoisonValue::get(ArrayTy);
+  Constant *ZAVal = PoisonValue::get(ZeroSizedArrayTy);
+  Constant *VVal = PoisonValue::get(VectorTy);
 
   EXPECT_TRUE(EVOp.SourcePreds[0].matches({}, SVal));
   EXPECT_FALSE(EVOp.SourcePreds[0].matches({}, OVal));
@@ -462,12 +462,12 @@ TEST(OperationsTest, ExtractAndInsertValue) {
 
   // InsertValue should accept any type in the struct, but only in positions
   // where it makes sense.
-  EXPECT_TRUE(IVOp.SourcePreds[1].matches({SVal}, UndefValue::get(Int8PtrTy)));
-  EXPECT_TRUE(IVOp.SourcePreds[1].matches({SVal}, UndefValue::get(Int32Ty)));
-  EXPECT_FALSE(IVOp.SourcePreds[1].matches({SVal}, UndefValue::get(Int64Ty)));
-  EXPECT_FALSE(IVOp.SourcePreds[2].matches({SVal, UndefValue::get(Int32Ty)},
+  EXPECT_TRUE(IVOp.SourcePreds[1].matches({SVal}, PoisonValue::get(Int8PtrTy)));
+  EXPECT_TRUE(IVOp.SourcePreds[1].matches({SVal}, PoisonValue::get(Int32Ty)));
+  EXPECT_FALSE(IVOp.SourcePreds[1].matches({SVal}, PoisonValue::get(Int64Ty)));
+  EXPECT_FALSE(IVOp.SourcePreds[2].matches({SVal, PoisonValue::get(Int32Ty)},
                                            ConstantInt::get(Int32Ty, 0)));
-  EXPECT_TRUE(IVOp.SourcePreds[2].matches({SVal, UndefValue::get(Int32Ty)},
+  EXPECT_TRUE(IVOp.SourcePreds[2].matches({SVal, PoisonValue::get(Int32Ty)},
                                           ConstantInt::get(Int32Ty, 1)));
 
   EXPECT_THAT(IVOp.SourcePreds[1].generate({SVal}, {}),
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index fbdab1975df72..628221339c89b 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -3541,12 +3541,12 @@ TEST_F(DIExpressionTest, Fold) {
   ResExpr = DIExpression::get(Context, ResOps);
   EXPECT_EQ(E, ResExpr);
 
-  // Test a left shift greater than 64.
+  // Test a left shift greater than 63.
   Ops.clear();
   Ops.push_back(dwarf::DW_OP_constu);
   Ops.push_back(1);
   Ops.push_back(dwarf::DW_OP_constu);
-  Ops.push_back(65);
+  Ops.push_back(64);
   Ops.push_back(dwarf::DW_OP_shl);
   Expr = DIExpression::get(Context, Ops);
   E = Expr->foldConstantMath();
@@ -3554,17 +3554,17 @@ TEST_F(DIExpressionTest, Fold) {
   ResOps.push_back(dwarf::DW_OP_constu);
   ResOps.push_back(1);
   ResOps.push_back(dwarf::DW_OP_constu);
-  ResOps.push_back(65);
+  ResOps.push_back(64);
   ResOps.push_back(dwarf::DW_OP_shl);
   ResExpr = DIExpression::get(Context, ResOps);
   EXPECT_EQ(E, ResExpr);
 
-  // Test a right shift greater than 64.
+  // Test a right shift greater than 63.
   Ops.clear();
   Ops.push_back(dwarf::DW_OP_constu);
   Ops.push_back(1);
   Ops.push_back(dwarf::DW_OP_constu);
-  Ops.push_back(65);
+  Ops.push_back(64);
   Ops.push_back(dwarf::DW_OP_shr);
   Expr = DIExpression::get(Context, Ops);
   E = Expr->foldConstantMath();
@@ -3572,7 +3572,7 @@ TEST_F(DIExpressionTest, Fold) {
   ResOps.push_back(dwarf::DW_OP_constu);
   ResOps.push_back(1);
   ResOps.push_back(dwarf::DW_OP_constu);
-  ResOps.push_back(65);
+  ResOps.push_back(64);
   ResOps.push_back(dwarf::DW_OP_shr);
   ResExpr = DIExpression::get(Context, ResOps);
   EXPECT_EQ(E, ResExpr);
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 582efad531bf7..8bd39fd71266a 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -21,11 +21,14 @@
 #include "llvm/Testing/Support/Error.h"
 #include "gtest/gtest.h"
 #include <cstdarg>
+#include <initializer_list>
 #include <optional>
 
 using namespace llvm;
+using ::llvm::memprof::LineLocation;
 using ::testing::EndsWith;
 using ::testing::IsSubsetOf;
+using ::testing::Pair;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAre;
 
@@ -348,10 +351,10 @@ TEST_F(InstrProfTest, test_merge_traces_sampled) {
 using ::llvm::memprof::IndexedMemProfRecord;
 using ::llvm::memprof::MemInfoBlock;
 using FrameIdMapTy =
-    llvm::DenseMap<::llvm::memprof::FrameId, ::llvm::memprof::Frame>;
+    llvm::MapVector<::llvm::memprof::FrameId, ::llvm::memprof::Frame>;
 using CallStackIdMapTy =
-    llvm::DenseMap<::llvm::memprof::CallStackId,
-                   ::llvm::SmallVector<::llvm::memprof::FrameId>>;
+    llvm::MapVector<::llvm::memprof::CallStackId,
+                    ::llvm::SmallVector<::llvm::memprof::FrameId>>;
 
 static FrameIdMapTy getFrameMapping() {
   FrameIdMapTy Mapping;
@@ -414,8 +417,7 @@ makeRecordV2(std::initializer_list<::llvm::memprof::CallStackId> AllocFrames,
   for (const auto &CSId : AllocFrames)
     // We don't populate IndexedAllocationInfo::CallStack because we use it only
     // in Version1.
-    MR.AllocSites.emplace_back(::llvm::SmallVector<memprof::FrameId>(), CSId,
-                               Block, Schema);
+    MR.AllocSites.emplace_back(CSId, Block, Schema);
   for (const auto &CSId : CallSiteFrames)
     MR.CallSiteIds.push_back(CSId);
   return MR;
@@ -467,11 +469,11 @@ TEST_F(InstrProfTest, test_memprof_v0) {
       /*CallSiteFrames=*/{
           {4, 5},
       });
-  const FrameIdMapTy IdToFrameMap = getFrameMapping();
-  for (const auto &I : IdToFrameMap) {
-    Writer.addMemProfFrame(I.first, I.getSecond(), Err);
-  }
-  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+
+  memprof::IndexedMemProfData MemProfData;
+  MemProfData.Frames = getFrameMapping();
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer.addMemProfData(MemProfData, Err);
 
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
@@ -482,8 +484,8 @@ TEST_F(InstrProfTest, test_memprof_v0) {
 
   std::optional<memprof::FrameId> LastUnmappedFrameId;
   auto IdToFrameCallback = [&](const memprof::FrameId Id) {
-    auto Iter = IdToFrameMap.find(Id);
-    if (Iter == IdToFrameMap.end()) {
+    auto Iter = MemProfData.Frames.find(Id);
+    if (Iter == MemProfData.Frames.end()) {
       LastUnmappedFrameId = Id;
       return memprof::Frame(0, 0, 0, false);
     }
@@ -508,15 +510,11 @@ TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
   const IndexedMemProfRecord IndexedMR = makeRecordV2(
       /*AllocFrames=*/{0x111, 0x222},
       /*CallSiteFrames=*/{0x333}, MIB, memprof::getFullSchema());
-  const FrameIdMapTy IdToFrameMap = getFrameMapping();
-  const auto CSIdToCallStackMap = getCallStackMapping();
-  for (const auto &I : IdToFrameMap) {
-    Writer.addMemProfFrame(I.first, I.getSecond(), Err);
-  }
-  for (const auto &I : CSIdToCallStackMap) {
-    Writer.addMemProfCallStack(I.first, I.getSecond(), Err);
-  }
-  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+  memprof::IndexedMemProfData MemProfData;
+  MemProfData.Frames = getFrameMapping();
+  MemProfData.CallStacks = getCallStackMapping();
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer.addMemProfData(MemProfData, Err);
 
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
@@ -525,9 +523,10 @@ TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
   ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
   const memprof::MemProfRecord &Record = RecordOr.get();
 
-  memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
-  memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
-      CSIdToCallStackMap, FrameIdConv);
+  memprof::FrameIdConverter<decltype(MemProfData.Frames)> FrameIdConv(
+      MemProfData.Frames);
+  memprof::CallStackIdConverter<decltype(MemProfData.CallStacks)> CSIdConv(
+      MemProfData.CallStacks, FrameIdConv);
 
   const ::llvm::memprof::MemProfRecord WantRecord =
       IndexedMR.toMemProfRecord(CSIdConv);
@@ -550,15 +549,11 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) {
   const IndexedMemProfRecord IndexedMR = makeRecordV2(
       /*AllocFrames=*/{0x111, 0x222},
       /*CallSiteFrames=*/{0x333}, MIB, memprof::getHotColdSchema());
-  const FrameIdMapTy IdToFrameMap = getFrameMapping();
-  const auto CSIdToCallStackMap = getCallStackMapping();
-  for (const auto &I : IdToFrameMap) {
-    Writer.addMemProfFrame(I.first, I.getSecond(), Err);
-  }
-  for (const auto &I : CSIdToCallStackMap) {
-    Writer.addMemProfCallStack(I.first, I.getSecond(), Err);
-  }
-  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+  memprof::IndexedMemProfData MemProfData;
+  MemProfData.Frames = getFrameMapping();
+  MemProfData.CallStacks = getCallStackMapping();
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer.addMemProfData(MemProfData, Err);
 
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
@@ -567,9 +562,10 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) {
   ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
   const memprof::MemProfRecord &Record = RecordOr.get();
 
-  memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
-  memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
-      CSIdToCallStackMap, FrameIdConv);
+  memprof::FrameIdConverter<decltype(MemProfData.Frames)> FrameIdConv(
+      MemProfData.Frames);
+  memprof::CallStackIdConverter<decltype(MemProfData.CallStacks)> CSIdConv(
+      MemProfData.CallStacks, FrameIdConv);
 
   const ::llvm::memprof::MemProfRecord WantRecord =
       IndexedMR.toMemProfRecord(CSIdConv);
@@ -601,23 +597,21 @@ TEST_F(InstrProfTest, test_caller_callee_pairs) {
   //       Line: 7, Column: 8
   //         new(...)
 
-  const std::pair<memprof::FrameId, memprof::Frame> Frames[] = {
-      {0, {0x123, 1, 2, false}},
-      {1, {0x234, 3, 4, true}},
-      {2, {0x123, 5, 6, false}},
-      {3, {0x345, 7, 8, true}}};
-  for (const auto &[FrameId, Frame] : Frames)
-    Writer.addMemProfFrame(FrameId, Frame, Err);
-
-  const std::pair<memprof::CallStackId, SmallVector<memprof::FrameId>>
-      CallStacks[] = {{0x111, {1, 0}}, {0x222, {3, 2}}};
-  for (const auto &[CSId, CallStack] : CallStacks)
-    Writer.addMemProfCallStack(CSId, CallStack, Err);
-
   const IndexedMemProfRecord IndexedMR = makeRecordV2(
       /*AllocFrames=*/{0x111, 0x222},
       /*CallSiteFrames=*/{}, MIB, memprof::getHotColdSchema());
-  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+
+  memprof::IndexedMemProfData MemProfData;
+  MemProfData.Frames.try_emplace(0, 0x123, 1, 2, false);
+  MemProfData.Frames.try_emplace(1, 0x234, 3, 4, true);
+  MemProfData.Frames.try_emplace(2, 0x123, 5, 6, false);
+  MemProfData.Frames.try_emplace(3, 0x345, 7, 8, true);
+  MemProfData.CallStacks.try_emplace(
+      0x111, std::initializer_list<memprof::FrameId>{1, 0});
+  MemProfData.CallStacks.try_emplace(
+      0x222, std::initializer_list<memprof::FrameId>{3, 2});
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer.addMemProfData(MemProfData, Err);
 
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
@@ -628,47 +622,31 @@ TEST_F(InstrProfTest, test_caller_callee_pairs) {
   auto It = Pairs.find(0x123);
   ASSERT_NE(It, Pairs.end());
   ASSERT_THAT(It->second, SizeIs(2));
-  EXPECT_THAT(It->second[0], testing::Pair(testing::FieldsAre(1U, 2U), 0x234U));
-  EXPECT_THAT(It->second[1], testing::Pair(testing::FieldsAre(5U, 6U), 0x345U));
+  EXPECT_THAT(It->second[0], Pair(LineLocation(1, 2), 0x234U));
+  EXPECT_THAT(It->second[1], Pair(LineLocation(5, 6), 0x345U));
 
   It = Pairs.find(0x234);
   ASSERT_NE(It, Pairs.end());
   ASSERT_THAT(It->second, SizeIs(1));
-  EXPECT_THAT(It->second[0], testing::Pair(testing::FieldsAre(3U, 4U), 0U));
+  EXPECT_THAT(It->second[0], Pair(LineLocation(3, 4), 0U));
 
   It = Pairs.find(0x345);
   ASSERT_NE(It, Pairs.end());
   ASSERT_THAT(It->second, SizeIs(1));
-  EXPECT_THAT(It->second[0], testing::Pair(testing::FieldsAre(7U, 8U), 0U));
+  EXPECT_THAT(It->second[0], Pair(LineLocation(7, 8), 0U));
 }
 
 TEST_F(InstrProfTest, test_memprof_getrecord_error) {
   ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf),
                     Succeeded());
 
-  const IndexedMemProfRecord IndexedMR = makeRecord(
-      /*AllocFrames=*/
-      {
-          {0, 1},
-          {2, 3},
-      },
-      /*CallSiteFrames=*/{
-          {4, 5},
-      });
-  // We skip adding the frame mappings here unlike the test_memprof unit test
-  // above to exercise the failure path when getMemProfRecord is invoked.
-  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
-
+  Writer.setMemProfVersionRequested(memprof::Version3);
+  // Generate an empty profile.
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
 
-  // Missing frames give a hash_mismatch error.
-  auto RecordOr = Reader->getMemProfRecord(0x9999);
-  ASSERT_TRUE(
-      ErrorEquals(instrprof_error::hash_mismatch, RecordOr.takeError()));
-
   // Missing functions give a unknown_function error.
-  RecordOr = Reader->getMemProfRecord(0x1111);
+  auto RecordOr = Reader->getMemProfRecord(0x1111);
   ASSERT_TRUE(
       ErrorEquals(instrprof_error::unknown_function, RecordOr.takeError()));
 }
@@ -681,19 +659,15 @@ TEST_F(InstrProfTest, test_memprof_merge) {
   ASSERT_THAT_ERROR(Writer2.mergeProfileKind(InstrProfKind::MemProf),
                     Succeeded());
 
-  const FrameIdMapTy IdToFrameMap = getFrameMapping();
-  for (const auto &I : IdToFrameMap) {
-    Writer2.addMemProfFrame(I.first, I.getSecond(), Err);
-  }
-
-  const auto CSIdToCallStackMap = getCallStackMapping();
-  for (const auto &[CSId, CallStack] : CSIdToCallStackMap)
-    Writer2.addMemProfCallStack(CSId, CallStack, Err);
-
   const IndexedMemProfRecord IndexedMR = makeRecordV2(
       /*AllocFrames=*/{0x111, 0x222},
       /*CallSiteFrames=*/{}, makePartialMIB(), memprof::getHotColdSchema());
-  Writer2.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+
+  memprof::IndexedMemProfData MemProfData;
+  MemProfData.Frames = getFrameMapping();
+  MemProfData.CallStacks = getCallStackMapping();
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer2.addMemProfData(MemProfData, Err);
 
   ASSERT_THAT_ERROR(Writer.mergeProfileKind(Writer2.getProfileKind()),
                     Succeeded());
@@ -714,9 +688,10 @@ TEST_F(InstrProfTest, test_memprof_merge) {
 
   std::optional<memprof::FrameId> LastUnmappedFrameId;
 
-  memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
-  memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
-      CSIdToCallStackMap, FrameIdConv);
+  memprof::FrameIdConverter<decltype(MemProfData.Frames)> FrameIdConv(
+      MemProfData.Frames);
+  memprof::CallStackIdConverter<decltype(MemProfData.CallStacks)> CSIdConv(
+      MemProfData.CallStacks, FrameIdConv);
 
   const ::llvm::memprof::MemProfRecord WantRecord =
       IndexedMR.toMemProfRecord(CSIdConv);
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index c90669811e60a..79b644dc5a528 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -315,7 +315,7 @@ TEST(MemProf, RecordSerializationRoundTripVerion2) {
   IndexedMemProfRecord Record;
   for (const auto &CSId : CallStackIds) {
     // Use the same info block for both allocation sites.
-    Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info);
+    Record.AllocSites.emplace_back(CSId, Info);
   }
   Record.CallSiteIds.assign(CallSiteIds);
 
@@ -346,8 +346,7 @@ TEST(MemProf, RecordSerializationRoundTripVersion2HotColdSchema) {
   IndexedMemProfRecord Record;
   for (const auto &CSId : CallStackIds) {
     // Use the same info block for both allocation sites.
-    Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info,
-                                   Schema);
+    Record.AllocSites.emplace_back(CSId, Info, Schema);
   }
   Record.CallSiteIds.assign(CallSiteIds);
 
@@ -455,26 +454,26 @@ TEST(MemProf, SymbolizationFilter) {
 }
 
 TEST(MemProf, BaseMemProfReader) {
-  llvm::DenseMap<FrameId, Frame> FrameIdMap;
+  llvm::memprof::IndexedMemProfData MemProfData;
   Frame F1(/*Hash=*/IndexedMemProfRecord::getGUID("foo"), /*LineOffset=*/20,
            /*Column=*/5, /*IsInlineFrame=*/true);
   Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10,
            /*Column=*/2, /*IsInlineFrame=*/false);
-  FrameIdMap.insert({F1.hash(), F1});
-  FrameIdMap.insert({F2.hash(), F2});
+  MemProfData.Frames.insert({F1.hash(), F1});
+  MemProfData.Frames.insert({F2.hash(), F2});
+
+  llvm::SmallVector<FrameId> CallStack{F1.hash(), F2.hash()};
+  CallStackId CSId = llvm::memprof::hashCallStack(CallStack);
+  MemProfData.CallStacks.try_emplace(CSId, CallStack);
 
-  llvm::MapVector<llvm::GlobalValue::GUID, IndexedMemProfRecord> ProfData;
   IndexedMemProfRecord FakeRecord;
   MemInfoBlock Block;
   Block.AllocCount = 1U, Block.TotalAccessDensity = 4,
   Block.TotalLifetime = 200001;
-  std::array<FrameId, 2> CallStack{F1.hash(), F2.hash()};
-  FakeRecord.AllocSites.emplace_back(
-      /*CS=*/CallStack, /*CSId=*/llvm::memprof::hashCallStack(CallStack),
-      /*MB=*/Block);
-  ProfData.insert({F1.hash(), FakeRecord});
+  FakeRecord.AllocSites.emplace_back(/*CSId=*/CSId, /*MB=*/Block);
+  MemProfData.Records.insert({F1.hash(), FakeRecord});
 
-  MemProfReader Reader(FrameIdMap, ProfData);
+  MemProfReader Reader(MemProfData);
 
   llvm::SmallVector<MemProfRecord, 1> Records;
   for (const auto &KeyRecordPair : Reader) {
@@ -491,31 +490,28 @@ TEST(MemProf, BaseMemProfReader) {
 }
 
 TEST(MemProf, BaseMemProfReaderWithCSIdMap) {
-  llvm::DenseMap<FrameId, Frame> FrameIdMap;
+  llvm::memprof::IndexedMemProfData MemProfData;
   Frame F1(/*Hash=*/IndexedMemProfRecord::getGUID("foo"), /*LineOffset=*/20,
            /*Column=*/5, /*IsInlineFrame=*/true);
   Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10,
            /*Column=*/2, /*IsInlineFrame=*/false);
-  FrameIdMap.insert({F1.hash(), F1});
-  FrameIdMap.insert({F2.hash(), F2});
+  MemProfData.Frames.insert({F1.hash(), F1});
+  MemProfData.Frames.insert({F2.hash(), F2});
 
-  llvm::DenseMap<CallStackId, llvm::SmallVector<FrameId>> CSIdMap;
   llvm::SmallVector<FrameId> CallStack = {F1.hash(), F2.hash()};
   CallStackId CSId = llvm::memprof::hashCallStack(CallStack);
-  CSIdMap.insert({CSId, CallStack});
+  MemProfData.CallStacks.insert({CSId, CallStack});
 
-  llvm::MapVector<llvm::GlobalValue::GUID, IndexedMemProfRecord> ProfData;
   IndexedMemProfRecord FakeRecord;
   MemInfoBlock Block;
   Block.AllocCount = 1U, Block.TotalAccessDensity = 4,
   Block.TotalLifetime = 200001;
   FakeRecord.AllocSites.emplace_back(
-      /*CS=*/llvm::SmallVector<FrameId>(),
       /*CSId=*/llvm::memprof::hashCallStack(CallStack),
       /*MB=*/Block);
-  ProfData.insert({F1.hash(), FakeRecord});
+  MemProfData.Records.insert({F1.hash(), FakeRecord});
 
-  MemProfReader Reader(FrameIdMap, CSIdMap, ProfData);
+  MemProfReader Reader(MemProfData);
 
   llvm::SmallVector<MemProfRecord, 1> Records;
   for (const auto &KeyRecordPair : Reader) {
@@ -610,7 +606,7 @@ MemInfoBlock makePartialMIB() {
 TEST(MemProf, MissingCallStackId) {
   // Use a non-existent CallStackId to trigger a mapping error in
   // toMemProfRecord.
-  llvm::memprof::IndexedAllocationInfo AI({}, 0xdeadbeefU, makePartialMIB(),
+  llvm::memprof::IndexedAllocationInfo AI(0xdeadbeefU, makePartialMIB(),
                                           llvm::memprof::getHotColdSchema());
 
   IndexedMemProfRecord IndexedMR;
@@ -633,7 +629,7 @@ TEST(MemProf, MissingCallStackId) {
 }
 
 TEST(MemProf, MissingFrameId) {
-  llvm::memprof::IndexedAllocationInfo AI({}, 0x222, makePartialMIB(),
+  llvm::memprof::IndexedAllocationInfo AI(0x222, makePartialMIB(),
                                           llvm::memprof::getHotColdSchema());
 
   IndexedMemProfRecord IndexedMR;
@@ -663,8 +659,8 @@ TEST(MemProf, RadixTreeBuilderEmpty) {
   llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
@@ -681,8 +677,8 @@ TEST(MemProf, RadixTreeBuilderOne) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
@@ -708,8 +704,8 @@ TEST(MemProf, RadixTreeBuilderTwo) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
@@ -746,8 +742,8 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
   llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
       FrameHistogram =
-          llvm::memprof::computeFrameHistogram(MemProfCallStackData);
-  llvm::memprof::CallStackRadixTreeBuilder Builder;
+          llvm::memprof::computeFrameHistogram<FrameId>(MemProfCallStackData);
+  llvm::memprof::CallStackRadixTreeBuilder<FrameId> Builder;
   Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
                 FrameHistogram);
   EXPECT_THAT(Builder.getRadixArray(),
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index 4f2cfa6b06ecd..4eedab124bfa0 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -1844,3 +1844,71 @@ define void @foo(i32 %arg, float %farg) {
   Ctx.revert();
   EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF);
 }
+
+// IRSnapshotChecker is only defined in debug mode.
+#ifndef NDEBUG
+
+TEST_F(TrackerTest, IRSnapshotCheckerNoChanges) {
+  parseIR(C, R"IR(
+define i32 @foo(i32 %arg) {
+  %add0 = add i32 %arg, %arg
+  ret i32 %add0
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  // Snapshot the freshly created function and check it with no edits between.
+  [[maybe_unused]] auto *F = Ctx.createFunction(&LLVMF);
+  sandboxir::IRSnapshotChecker Checker(Ctx);
+  Checker.save();
+  Checker.expectNoDiff();
+}
+
+TEST_F(TrackerTest, IRSnapshotCheckerDiesWithUnexpectedChanges) {
+  parseIR(C, R"IR(
+define i32 @foo(i32 %arg) {
+  %add0 = add i32 %arg, %arg
+  %add1 = add i32 %add0, %arg
+  ret i32 %add1
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  // Take both adds, snapshot, then change an operand of the second add.
+  auto *F = Ctx.createFunction(&LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  sandboxir::Instruction *Add0 = &*It++;
+  sandboxir::Instruction *Add1 = &*It++;
+  sandboxir::IRSnapshotChecker Checker(Ctx);
+  Checker.save();
+  Add1->setOperand(1, Add0);
+  EXPECT_DEATH(Checker.expectNoDiff(), "Found IR difference");
+}
+
+TEST_F(TrackerTest, IRSnapshotCheckerSaveMultipleTimes) {
+  parseIR(C, R"IR(
+define i32 @foo(i32 %arg) {
+  %add0 = add i32 %arg, %arg
+  %add1 = add i32 %add0, %arg
+  ret i32 %add1
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  // Take both adds; the second one is modified between the two snapshots.
+  auto *F = Ctx.createFunction(&LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  sandboxir::Instruction *Add0 = &*It++;
+  sandboxir::Instruction *Add1 = &*It++;
+  sandboxir::IRSnapshotChecker Checker(Ctx);
+  Checker.save();
+  Add1->setOperand(1, Add0);
+  // Now IR differs from the last snapshot. Let's take a new snapshot.
+  Checker.save();
+  // The new snapshot should have replaced the old one, so this should succeed.
+  Checker.expectNoDiff();
+}
+
+#endif // NDEBUG
diff --git a/llvm/unittests/TextAPI/TextStubV5Tests.cpp b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
index 62fdd79ae4970..24577b3ec6148 100644
--- a/llvm/unittests/TextAPI/TextStubV5Tests.cpp
+++ b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
@@ -2077,7 +2077,8 @@ TEST(TBDv5, RemoveIF) {
             "x86_64-macos"
         ],
       "attributes": [
-            "flat_namespace"
+            "flat_namespace",
+            "not_for_dyld_shared_cache"
         ]
     }
   ],
@@ -2242,6 +2243,7 @@ TEST(TBDv5, RemoveIF) {
   EXPECT_EQ(PackedVersion(1, 2, 0), RemovedFile->getCurrentVersion());
   EXPECT_EQ(PackedVersion(1, 1, 0), RemovedFile->getCompatibilityVersion());
   EXPECT_TRUE(RemovedFile->isApplicationExtensionSafe());
+  EXPECT_TRUE(RemovedFile->isOSLibNotForSharedCache());
   EXPECT_FALSE(RemovedFile->isTwoLevelNamespace());
   EXPECT_EQ(0U, RemovedFile->documents().size());
 
diff --git a/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp b/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp
index cd0e8357a2b2d..f672ed2b1251f 100644
--- a/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp
+++ b/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp
@@ -11,8 +11,11 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Testing/Support/Error.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 
 #include "gmock/gmock.h"
@@ -21,9 +24,11 @@
 namespace {
 using namespace llvm;
 using namespace llvm::memprof;
-using testing::FieldsAre;
+using testing::Contains;
+using testing::ElementsAre;
 using testing::Pair;
 using testing::SizeIs;
+using testing::UnorderedElementsAre;
 
 TEST(MemProf, ExtractDirectCallsFromIR) {
   // The following IR is generated from:
@@ -101,12 +106,12 @@ declare !dbg !19 void @_Z2f3v()
 
   // Verify that call sites show up in the ascending order of their source
   // locations.
-  EXPECT_THAT(CallSites[0],
-              Pair(FieldsAre(1U, 3U), IndexedMemProfRecord::getGUID("_Z2f1v")));
-  EXPECT_THAT(CallSites[1],
-              Pair(FieldsAre(2U, 3U), IndexedMemProfRecord::getGUID("_Z2f2v")));
-  EXPECT_THAT(CallSites[2],
-              Pair(FieldsAre(2U, 9U), IndexedMemProfRecord::getGUID("_Z2f3v")));
+  EXPECT_THAT(CallSites[0], Pair(LineLocation(1, 3),
+                                 IndexedMemProfRecord::getGUID("_Z2f1v")));
+  EXPECT_THAT(CallSites[1], Pair(LineLocation(2, 3),
+                                 IndexedMemProfRecord::getGUID("_Z2f2v")));
+  EXPECT_THAT(CallSites[2], Pair(LineLocation(2, 9),
+                                 IndexedMemProfRecord::getGUID("_Z2f3v")));
 }
 
 TEST(MemProf, ExtractDirectCallsFromIRInline) {
@@ -201,9 +206,9 @@ declare !dbg !25 void @_Z2g2v() local_unnamed_addr
   const auto &[FooCallerGUID, FooCallSites] = *FooIt;
   EXPECT_EQ(FooCallerGUID, IndexedMemProfRecord::getGUID("_Z3foov"));
   ASSERT_THAT(FooCallSites, SizeIs(2));
-  EXPECT_THAT(FooCallSites[0], Pair(FieldsAre(1U, 3U),
+  EXPECT_THAT(FooCallSites[0], Pair(LineLocation(1, 3),
                                     IndexedMemProfRecord::getGUID("_ZL2f3v")));
-  EXPECT_THAT(FooCallSites[1], Pair(FieldsAre(2U, 9U),
+  EXPECT_THAT(FooCallSites[1], Pair(LineLocation(2, 9),
                                     IndexedMemProfRecord::getGUID("_ZL2g3v")));
 
   auto F2It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2f2v"));
@@ -211,15 +216,15 @@ declare !dbg !25 void @_Z2g2v() local_unnamed_addr
   const auto &[F2CallerGUID, F2CallSites] = *F2It;
   EXPECT_EQ(F2CallerGUID, IndexedMemProfRecord::getGUID("_ZL2f2v"));
   ASSERT_THAT(F2CallSites, SizeIs(1));
-  EXPECT_THAT(F2CallSites[0],
-              Pair(FieldsAre(2U, 3U), IndexedMemProfRecord::getGUID("_Z2f1v")));
+  EXPECT_THAT(F2CallSites[0], Pair(LineLocation(2, 3),
+                                   IndexedMemProfRecord::getGUID("_Z2f1v")));
 
   auto F3It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2f3v"));
   ASSERT_NE(F3It, Calls.end());
   const auto &[F3CallerGUID, F3CallSites] = *F3It;
   EXPECT_EQ(F3CallerGUID, IndexedMemProfRecord::getGUID("_ZL2f3v"));
   ASSERT_THAT(F3CallSites, SizeIs(1));
-  EXPECT_THAT(F3CallSites[0], Pair(FieldsAre(1U, 10U),
+  EXPECT_THAT(F3CallSites[0], Pair(LineLocation(1, 10),
                                    IndexedMemProfRecord::getGUID("_ZL2f2v")));
 
   auto G3It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2g3v"));
@@ -227,10 +232,10 @@ declare !dbg !25 void @_Z2g2v() local_unnamed_addr
   const auto &[G3CallerGUID, G3CallSites] = *G3It;
   EXPECT_EQ(G3CallerGUID, IndexedMemProfRecord::getGUID("_ZL2g3v"));
   ASSERT_THAT(G3CallSites, SizeIs(2));
-  EXPECT_THAT(G3CallSites[0],
-              Pair(FieldsAre(1U, 8U), IndexedMemProfRecord::getGUID("_Z2g1v")));
-  EXPECT_THAT(G3CallSites[1],
-              Pair(FieldsAre(2U, 3U), IndexedMemProfRecord::getGUID("_Z2g2v")));
+  EXPECT_THAT(G3CallSites[0], Pair(LineLocation(1, 8),
+                                   IndexedMemProfRecord::getGUID("_Z2g1v")));
+  EXPECT_THAT(G3CallSites[1], Pair(LineLocation(2, 3),
+                                   IndexedMemProfRecord::getGUID("_Z2g2v")));
 }
 
 TEST(MemProf, ExtractDirectCallsFromIRCallingNew) {
@@ -296,6 +301,190 @@ attributes #2 = { builtin allocsize(0) }
   const auto &[FooCallerGUID, FooCallSites] = *FooIt;
   EXPECT_EQ(FooCallerGUID, IndexedMemProfRecord::getGUID("_Z3foov"));
   ASSERT_THAT(FooCallSites, SizeIs(1));
-  EXPECT_THAT(FooCallSites[0], Pair(FieldsAre(1U, 10U), 0));
+  EXPECT_THAT(FooCallSites[0], Pair(LineLocation(1, 10), 0));
+}
+
+// Build a MemInfoBlock, populating the fields returned by getHotColdSchema.
+MemInfoBlock makePartialMIB() {
+  MemInfoBlock MIB;
+  MIB.AllocCount = 1;
+  MIB.TotalSize = 5;
+  MIB.TotalLifetime = 10;
+  MIB.TotalLifetimeAccessDensity = 23;
+  return MIB;
+}
+
+IndexedMemProfRecord
+makeRecordV2(std::initializer_list<::llvm::memprof::CallStackId> AllocFrames,
+             std::initializer_list<::llvm::memprof::CallStackId> CallSiteFrames,
+             const MemInfoBlock &Block, const memprof::MemProfSchema &Schema) {
+  llvm::memprof::IndexedMemProfRecord MR;
+  for (const auto &CSId : AllocFrames) {
+    // An allocation site is described by its CallStackId plus the block and
+    // schema; no inline list of frames is passed here.
+    MR.AllocSites.emplace_back(CSId, Block, Schema);
+  }
+  for (const auto &CSId : CallSiteFrames)
+    MR.CallSiteIds.push_back(CSId);
+  return MR;
+}
+
+static const auto Err = [](Error E) {
+  // FAIL() returns from the lambda, so E has to be consumed before it runs.
+  FAIL() << toString(std::move(E));
+};
+
+// Make sure that we can undrift direct calls.
+TEST(MemProf, ComputeUndriftingMap) {
+  // Suppose that the source code has changed from:
+  //
+  //   void bar();
+  //   void baz();
+  //   void zzz();
+  //
+  //   void foo() {
+  //     /**/ bar();  // LineLocation(1, 8)
+  //     zzz();       // LineLocation(2, 3)
+  //     baz();       // LineLocation(3, 3)
+  //   }
+  //
+  // to:
+  //
+  //   void bar();
+  //   void baz();
+  //
+  //   void foo() {
+  //     bar();        // LineLocation(1, 3)
+  //     /**/ baz();   // LineLocation(2, 8)
+  //   }
+  //
+  // Notice that the calls to bar and baz have drifted while zzz has been
+  // removed.
+  StringRef IR = R"IR(
+define dso_local void @_Z3foov() #0 !dbg !10 {
+entry:
+  call void @_Z3barv(), !dbg !13
+  call void @_Z3bazv(), !dbg !14
+  ret void, !dbg !15
+}
+
+declare !dbg !16 void @_Z3barv() #1
+
+declare !dbg !17 void @_Z3bazv() #1
+
+attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "foobar.cc", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 1, !"MemProfProfileFilename", !"memprof.profraw"}
+!6 = !{i32 8, !"PIC Level", i32 2}
+!7 = !{i32 7, !"PIE Level", i32 2}
+!8 = !{i32 7, !"uwtable", i32 2}
+!9 = !{!"clang"}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 3, scope: !10)
+!14 = !DILocation(line: 6, column: 8, scope: !10)
+!15 = !DILocation(line: 7, column: 1, scope: !10)
+!16 = !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!17 = !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 2, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+)IR";
+
+  LLVMContext Ctx;
+  SMDiagnostic SMErr;
+  std::unique_ptr<Module> M = parseAssemblyString(IR, SMErr, Ctx);
+  ASSERT_TRUE(M);
+
+  auto *F = M->getFunction("_Z3foov");
+  ASSERT_NE(F, nullptr);
+
+  TargetLibraryInfoWrapperPass WrapperPass;
+  auto &TLI = WrapperPass.getTLI(*F);
+  auto Calls = extractCallsFromIR(*M, TLI);
+
+  uint64_t GUIDFoo = IndexedMemProfRecord::getGUID("_Z3foov");
+  uint64_t GUIDBar = IndexedMemProfRecord::getGUID("_Z3barv");
+  uint64_t GUIDBaz = IndexedMemProfRecord::getGUID("_Z3bazv");
+  uint64_t GUIDZzz = IndexedMemProfRecord::getGUID("_Z3zzzv");
+
+  // Verify that extractCallsFromIR extracts caller-callee pairs as expected.
+  EXPECT_THAT(Calls,
+              UnorderedElementsAre(Pair(
+                  GUIDFoo, ElementsAre(Pair(LineLocation(1, 3), GUIDBar),
+                                       Pair(LineLocation(2, 8), GUIDBaz)))));
+
+  llvm::InstrProfWriter Writer;
+  std::unique_ptr<IndexedInstrProfReader> Reader;
+
+  const MemInfoBlock MIB = makePartialMIB();
+
+  Writer.setMemProfVersionRequested(memprof::Version3);
+  Writer.setMemProfFullSchema(false);
+
+  ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf),
+                    Succeeded());
+
+  const IndexedMemProfRecord IndexedMR = makeRecordV2(
+      /*AllocFrames=*/{0x111, 0x222, 0x333},
+      /*CallSiteFrames=*/{}, MIB, memprof::getHotColdSchema());
+
+  memprof::IndexedMemProfData MemProfData;
+  // The call sites within foo.
+  MemProfData.Frames.try_emplace(0, GUIDFoo, 1, 8, false);
+  MemProfData.Frames.try_emplace(1, GUIDFoo, 2, 3, false);
+  MemProfData.Frames.try_emplace(2, GUIDFoo, 3, 3, false);
+  // Line/column numbers below don't matter.
+  MemProfData.Frames.try_emplace(3, GUIDBar, 9, 9, false);
+  MemProfData.Frames.try_emplace(4, GUIDZzz, 9, 9, false);
+  MemProfData.Frames.try_emplace(5, GUIDBaz, 9, 9, false);
+  MemProfData.CallStacks.try_emplace(
+      0x111,
+      std::initializer_list<memprof::FrameId>{3, 0}); // bar called by foo
+  MemProfData.CallStacks.try_emplace(
+      0x222,
+      std::initializer_list<memprof::FrameId>{4, 1}); // zzz called by foo
+  MemProfData.CallStacks.try_emplace(
+      0x333,
+      std::initializer_list<memprof::FrameId>{5, 2}); // baz called by foo
+  MemProfData.Records.try_emplace(0x9999, IndexedMR);
+  Writer.addMemProfData(MemProfData, Err);
+
+  auto Profile = Writer.writeBuffer();
+  // Bail out if reading back fails; get() below is only valid on success.
+  auto ReaderOrErr =
+      IndexedInstrProfReader::create(std::move(Profile), nullptr);
+  ASSERT_THAT_ERROR(ReaderOrErr.takeError(), Succeeded());
+  Reader = std::move(ReaderOrErr.get());
+
+  // Verify that getMemProfCallerCalleePairs extracts caller-callee pairs as
+  // expected.
+  auto Pairs = Reader->getMemProfCallerCalleePairs();
+  ASSERT_THAT(Pairs, SizeIs(4));
+  ASSERT_THAT(
+      Pairs,
+      Contains(Pair(GUIDFoo, ElementsAre(Pair(LineLocation(1, 8), GUIDBar),
+                                         Pair(LineLocation(2, 3), GUIDZzz),
+                                         Pair(LineLocation(3, 3), GUIDBaz)))));
+
+  // Verify that computeUndriftMap identifies undrifting opportunities:
+  //
+  //   Profile                 IR
+  //   (Line: 1, Column: 8) -> (Line: 1, Column: 3)
+  //   (Line: 3, Column: 3) -> (Line: 2, Column: 8)
+  auto UndriftMap = computeUndriftMap(*M, Reader.get(), TLI);
+  ASSERT_THAT(UndriftMap,
+              UnorderedElementsAre(Pair(
+                  GUIDFoo, UnorderedElementsAre(
+                               Pair(LineLocation(1, 8), LineLocation(1, 3)),
+                               Pair(LineLocation(3, 3), LineLocation(2, 8))))));
 }
 } // namespace
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
index 061d57c31ce23..e6bb4b4684d26 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
@@ -194,7 +194,7 @@ define void @foo(i8 %v1, ptr %ptr) {
   auto *Call = cast<sandboxir::CallInst>(&*It++);
   auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   EXPECT_TRUE(isa<llvm::sandboxir::MemDGNode>(DAG.getNode(Store)));
   EXPECT_TRUE(isa<llvm::sandboxir::MemDGNode>(DAG.getNode(Load)));
@@ -224,7 +224,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   auto *S0 = cast<sandboxir::StoreInst>(&*It++);
   auto *S1 = cast<sandboxir::StoreInst>(&*It++);
   auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   auto Span = DAG.extend({&*BB->begin(), BB->getTerminator()});
   // Check extend().
   EXPECT_EQ(Span.top(), &*BB->begin());
@@ -285,7 +285,7 @@ define i8 @foo(i8 %v0, i8 %v1) {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
   auto It = BB->begin();
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
 
   auto *AddN0 = DAG.getNode(cast<sandboxir::BinaryOperator>(&*It++));
@@ -332,7 +332,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   auto *S1 = cast<sandboxir::StoreInst>(&*It++);
   [[maybe_unused]] auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
 
   auto *S0N = cast<sandboxir::MemDGNode>(DAG.getNode(S0));
@@ -366,7 +366,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   auto *S1 = cast<sandboxir::StoreInst>(&*It++);
   auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
 
   auto *S0N = cast<sandboxir::MemDGNode>(DAG.getNode(S0));
@@ -436,7 +436,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   sandboxir::Context Ctx(C);
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   auto It = BB->begin();
   auto *Store0N = cast<sandboxir::MemDGNode>(
@@ -461,7 +461,7 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) {
   sandboxir::Context Ctx(C);
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   auto It = BB->begin();
   auto *Store0N = cast<sandboxir::MemDGNode>(
@@ -487,7 +487,7 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) {
   sandboxir::Context Ctx(C);
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   auto It = BB->begin();
   auto *Ld0N = cast<sandboxir::MemDGNode>(
@@ -512,7 +512,7 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v) {
   sandboxir::Context Ctx(C);
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   auto It = BB->begin();
   auto *Store0N = cast<sandboxir::MemDGNode>(
@@ -542,7 +542,7 @@ define void @foo(float %v1, float %v2) {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -574,7 +574,7 @@ define void @foo() {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -606,7 +606,7 @@ define void @foo(i8 %v0, i8 %v1, ptr %ptr) {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -637,7 +637,7 @@ define void @foo(ptr %ptr) {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -664,7 +664,7 @@ define void @foo(ptr %ptr) {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -695,7 +695,7 @@ define void @foo() {
   auto *F = Ctx.createFunction(LLVMF);
   auto *BB = &*F->begin();
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()});
 
   auto It = BB->begin();
@@ -728,7 +728,7 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %v5) {
   auto *S3 = cast<sandboxir::StoreInst>(&*It++);
   auto *S4 = cast<sandboxir::StoreInst>(&*It++);
   auto *S5 = cast<sandboxir::StoreInst>(&*It++);
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   {
     // Scenario 1: Build new DAG
     auto NewIntvl = DAG.extend({S3, S3});
@@ -788,7 +788,7 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %v5) {
 
   {
     // Check UnscheduledSuccs when a node is scheduled
-    sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+    sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
     DAG.extend({S2, S2});
     auto *S2N = cast<sandboxir::MemDGNode>(DAG.getNode(S2));
     S2N->setScheduled(true);
@@ -798,3 +798,63 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %v5) {
     EXPECT_EQ(S1N->getNumUnscheduledSuccs(), 0u); // S1 is scheduled
   }
 }
+
+TEST_F(DependencyGraphTest, CreateInstrCallback) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %arg) {
+  store i8 %v1, ptr %ptr
+  store i8 %v2, ptr %ptr
+  store i8 %v3, ptr %ptr
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *S1 = cast<sandboxir::StoreInst>(&*It++);
+  [[maybe_unused]] auto *S2 = cast<sandboxir::StoreInst>(&*It++);
+  auto *S3 = cast<sandboxir::StoreInst>(&*It++);
+  // Check the new-instruction callback: a store created before S3, after the
+  // DAG was extended over [S1, S3], is expected to have a DAG node.
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
+  DAG.extend({S1, S3});
+  auto *Arg = F->getArg(3); // NOTE(review): presumably %v3, not %arg (4).
+  auto *Ptr = S1->getPointerOperand();
+  sandboxir::StoreInst *NewS =
+      sandboxir::StoreInst::create(Arg, Ptr, Align(8), S3->getIterator(),
+                                   /*IsVolatile=*/true, Ctx);
+  auto *NewSN = DAG.getNode(NewS);
+  EXPECT_TRUE(NewSN != nullptr);
+  // TODO: Check the dependencies to/from NewSN after they land.
+  // TODO: Check the MemDGNode chain.
+}
+
+TEST_F(DependencyGraphTest, EraseInstrCallback) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %arg) {
+  store i8 %v1, ptr %ptr
+  store i8 %v2, ptr %ptr
+  store i8 %v3, ptr %ptr
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *S1 = cast<sandboxir::StoreInst>(&*It++);
+  auto *S2 = cast<sandboxir::StoreInst>(&*It++);
+  auto *S3 = cast<sandboxir::StoreInst>(&*It++);
+
+  // Check erase instruction callback: S2's node must be gone once S2 is erased.
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
+  DAG.extend({S1, S3});
+  S2->eraseFromParent();
+  auto *DeletedN = DAG.getNodeOrNull(S2);
+  EXPECT_TRUE(DeletedN == nullptr);
+  // TODO: Check the dependencies between S1 and S3 after they land.
+  // TODO: Check the MemDGNode chain.
+}
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
index 94a5791442974..c5e44a97976a7 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
@@ -70,7 +70,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   auto *S1 = cast<sandboxir::StoreInst>(&*It++);
   auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
-  sandboxir::DependencyGraph DAG(getAA(*LLVMF));
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
   DAG.extend({&*BB->begin(), BB->getTerminator()});
   auto *SN0 = DAG.getNode(S0);
   auto *SN1 = DAG.getNode(S1);
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SeedCollectorTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SeedCollectorTest.cpp
index 95a8f66ac1e66..99a13801c7c33 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SeedCollectorTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SeedCollectorTest.cpp
@@ -394,12 +394,16 @@ define void @foo(ptr noalias %ptr, float %val) {
 
 TEST_F(SeedBundleTest, VectorStores) {
   parseIR(C, R"IR(
-define void @foo(ptr noalias %ptr, <2 x float> %val) {
+define void @foo(ptr noalias %ptr, <2 x float> %val0, i64 %val1) {
 bb:
   %ptr0 = getelementptr float, ptr %ptr, i32 0
   %ptr1 = getelementptr float, ptr %ptr, i32 1
-  store <2 x float> %val, ptr %ptr1
-  store <2 x float> %val, ptr %ptr0
+  %ptr2 = getelementptr i64, ptr %ptr, i32 2
+  store <2 x float> %val0, ptr %ptr1
+  store <2 x float> %val0, ptr %ptr0
+  store atomic i64 %val1, ptr %ptr2 unordered, align 8
+  store volatile i64 %val1, ptr %ptr2
+
   ret void
 }
 )IR");
@@ -418,7 +422,7 @@ define void @foo(ptr noalias %ptr, <2 x float> %val) {
   sandboxir::SeedCollector SC(&*BB, SE);
 
   // Find the stores
-  auto It = std::next(BB->begin(), 2);
+  auto It = std::next(BB->begin(), 3);
   // StX with X as the order by offset in memory
   auto *St1 = &*It++;
   auto *St0 = &*It++;
@@ -426,6 +430,8 @@ define void @foo(ptr noalias %ptr, <2 x float> %val) {
   auto StoreSeedsRange = SC.getStoreSeeds();
   EXPECT_EQ(range_size(StoreSeedsRange), 1u);
   auto &SB = *StoreSeedsRange.begin();
+  // isValidMemSeed check: The atomic and volatile stores should not
+  // be included in the bundle, but the vector stores should be.
   ExpectThatElementsAre(SB, {St0, St1});
 }
 
@@ -466,5 +472,50 @@ define void @foo(ptr noalias %ptr, float %v, <2 x float> %val) {
   auto StoreSeedsRange = SC.getStoreSeeds();
   EXPECT_EQ(range_size(StoreSeedsRange), 1u);
   auto &SB = *StoreSeedsRange.begin();
+  // isValidMemSeed check: all three of the stores should be included.
   ExpectThatElementsAre(SB, {St0, St1, St3});
 }
+
+TEST_F(SeedBundleTest, VectorLoads) {
+  parseIR(C, R"IR(
+define void @foo(ptr noalias %ptr, <2 x float> %val0) {
+bb:
+  %ptr0 = getelementptr float, ptr %ptr, i32 0
+  %ptr1 = getelementptr float, ptr %ptr, i32 1
+  %r0 = load <2 x float>, ptr %ptr0
+  %r1 = load <2 x float>, ptr %ptr1
+  %r2 = load atomic i64, ptr %ptr0 unordered, align 8
+  %r3 = load volatile i64, ptr %ptr1
+  %r4 = load void()*, ptr %ptr1
+
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  DominatorTree DT(LLVMF);
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI(TLII);
+  DataLayout DL(M->getDataLayout());
+  LoopInfo LI(DT);
+  AssumptionCache AC(LLVMF);
+  ScalarEvolution SE(LLVMF, TLI, AC, DT, LI);
+
+  sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto BB = F.begin();
+  sandboxir::SeedCollector SC(&*BB, SE);
+
+  // Find the loads
+  auto It = std::next(BB->begin(), 2);
+  // LdX with X as the order by offset in memory
+  auto *Ld0 = cast<sandboxir::LoadInst>(&*It++);
+  auto *Ld1 = cast<sandboxir::LoadInst>(&*It++);
+
+  auto LoadSeedsRange = SC.getLoadSeeds();
+  EXPECT_EQ(range_size(LoadSeedsRange), 2u);
+  auto &SB = *LoadSeedsRange.begin();
+  // isValidMemSeed check: The atomic and volatile loads should not
+  // be included in the bundle, the vector loads should be, but the
+  // void-typed load should not.
+  ExpectThatElementsAre(SB, {Ld0, Ld1});
+}
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 921d7d7975f6a..3dff50c44798d 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -585,6 +585,14 @@ TEST_F(X86Core2TargetTest, SetRegToFP1_4Bits) {
                           OpcodeIs(X86::LD_Fp80m), IsStackDeallocate(10)));
 }
 
+TEST_F(X86Core2TargetTest, SetRegToDf1) {
+  EXPECT_THAT(setRegTo(X86::DF, APInt(1, 1)), ElementsAre(OpcodeIs(X86::STD)));
+}
+
+TEST_F(X86Core2TargetTest, SetRegToDf0) {
+  EXPECT_THAT(setRegTo(X86::DF, APInt(1, 0)), ElementsAre(OpcodeIs(X86::CLD)));
+}
+
 TEST_F(X86Core2Avx512TargetTest, FillMemoryOperands_ADD64rm) {
   const Instruction &I = getInstr(X86::ADD64rm);
   InstructionTemplate IT(&I);
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index c8186d6e69523..163a1a78a2ec4 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -3056,6 +3056,14 @@ static bool SimplifyTree(TreePatternNodePtr &N) {
       !N->getExtType(0).empty() &&
       N->getExtType(0) == N->getChild(0).getExtType(0) &&
       N->getName().empty()) {
+    if (!N->getPredicateCalls().empty()) {
+      std::string Str;
+      raw_string_ostream OS(Str);
+      OS << *N
+         << "\n trivial bitconvert node should not have predicate calls\n";
+      PrintFatalError(Str);
+      return false;
+    }
     N = N->getChildShared(0);
     SimplifyTree(N);
     return true;
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 4a58deaa0ff1b..003712ae124c7 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -31,18 +31,18 @@ NOFOLD(VCOMPRESSPSZrrk)
 NOFOLD(VCVTPS2PHZ128rrk)
 NOFOLD(VCVTPS2PHZ256rrk)
 NOFOLD(VCVTPS2PHZrrk)
-NOFOLD(VEXTRACTF32x4Z256rrik)
-NOFOLD(VEXTRACTF32x4Zrrik)
-NOFOLD(VEXTRACTF32x8Zrrik)
-NOFOLD(VEXTRACTF64x2Z256rrik)
-NOFOLD(VEXTRACTF64x2Zrrik)
-NOFOLD(VEXTRACTF64x4Zrrik)
-NOFOLD(VEXTRACTI32x4Z256rrik)
-NOFOLD(VEXTRACTI32x4Zrrik)
-NOFOLD(VEXTRACTI32x8Zrrik)
-NOFOLD(VEXTRACTI64x2Z256rrik)
-NOFOLD(VEXTRACTI64x2Zrrik)
-NOFOLD(VEXTRACTI64x4Zrrik)
+NOFOLD(VEXTRACTF32X4Z256rrik)
+NOFOLD(VEXTRACTF32X4Zrrik)
+NOFOLD(VEXTRACTF32X8Zrrik)
+NOFOLD(VEXTRACTF64X2Z256rrik)
+NOFOLD(VEXTRACTF64X2Zrrik)
+NOFOLD(VEXTRACTF64X4Zrrik)
+NOFOLD(VEXTRACTI32X4Z256rrik)
+NOFOLD(VEXTRACTI32X4Zrrik)
+NOFOLD(VEXTRACTI32X8Zrrik)
+NOFOLD(VEXTRACTI64X2Z256rrik)
+NOFOLD(VEXTRACTI64X2Zrrik)
+NOFOLD(VEXTRACTI64X4Zrrik)
 NOFOLD(VMOVAPDZ128mrk)
 NOFOLD(VMOVAPDZ256mrk)
 NOFOLD(VMOVAPDZmrk)
diff --git a/llvm/utils/TableGen/X86ManualInstrMapping.def b/llvm/utils/TableGen/X86ManualInstrMapping.def
index 7c5a6033237fe..2fdc4dc90f340 100644
--- a/llvm/utils/TableGen/X86ManualInstrMapping.def
+++ b/llvm/utils/TableGen/X86ManualInstrMapping.def
@@ -246,14 +246,14 @@ ENTRY(VCVTTPD2DQZ256rm, VCVTTPD2DQYrm)
 ENTRY(VCVTTPD2DQZ256rr, VCVTTPD2DQYrr)
 ENTRY(VDIVPDZ256rm, VDIVPDYrm)
 ENTRY(VDIVPDZ256rr, VDIVPDYrr)
-ENTRY(VEXTRACTF64x2Z256mri, VEXTRACTF128mri)
-ENTRY(VEXTRACTF64x2Z256rri, VEXTRACTF128rri)
-ENTRY(VEXTRACTI64x2Z256mri, VEXTRACTI128mri)
-ENTRY(VEXTRACTI64x2Z256rri, VEXTRACTI128rri)
-ENTRY(VINSERTF64x2Z256rmi, VINSERTF128rmi)
-ENTRY(VINSERTF64x2Z256rri, VINSERTF128rri)
-ENTRY(VINSERTI64x2Z256rmi, VINSERTI128rmi)
-ENTRY(VINSERTI64x2Z256rri, VINSERTI128rri)
+ENTRY(VEXTRACTF64X2Z256mri, VEXTRACTF128mri)
+ENTRY(VEXTRACTF64X2Z256rri, VEXTRACTF128rri)
+ENTRY(VEXTRACTI64X2Z256mri, VEXTRACTI128mri)
+ENTRY(VEXTRACTI64X2Z256rri, VEXTRACTI128rri)
+ENTRY(VINSERTF64X2Z256rmi, VINSERTF128rmi)
+ENTRY(VINSERTF64X2Z256rri, VINSERTF128rri)
+ENTRY(VINSERTI64X2Z256rmi, VINSERTI128rmi)
+ENTRY(VINSERTI64X2Z256rri, VINSERTI128rri)
 ENTRY(VMAXCPDZ256rm, VMAXCPDYrm)
 ENTRY(VMAXCPDZ256rr, VMAXCPDYrr)
 ENTRY(VMAXPDZ256rm, VMAXPDYrm)
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 229a056c8afef..343af16519756 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -277,7 +277,6 @@ if (current_toolchain == default_toolchain) {
       "__assert",
       "__atomic/aliases.h",
       "__atomic/atomic.h",
-      "__atomic/atomic_base.h",
       "__atomic/atomic_flag.h",
       "__atomic/atomic_init.h",
       "__atomic/atomic_lock_free.h",
diff --git a/mlir/docs/Bufferization.md b/mlir/docs/Bufferization.md
index 7d38ebb38535c..e16fe91212a1a 100644
--- a/mlir/docs/Bufferization.md
+++ b/mlir/docs/Bufferization.md
@@ -23,11 +23,6 @@ the resulting `memref` IR has no memory leaks.
 
 ## Deprecated Passes
 
-The old dialect conversion-based bufferization passes have been deprecated and
-should not be used anymore. Most of those passes have already been removed from
-MLIR. One-Shot Bufferize produces in better bufferization results with fewer
-memory allocations and buffer copies.
-
 The buffer deallocation pass has been deprecated in favor of the ownership-based
 buffer deallocation pipeline. The deprecated pass has some limitations that may
 cause memory leaks in the resulting IR.
@@ -276,18 +271,13 @@ semantics (i.e., tensor result or tensor operand) that is not bufferizable
 `to_memref`/`to_tensor` ops around the bufferization boundary.
 
 One-Shot Bufferize can be configured to bufferize only ops from a set of
-dialects with `dialect-filter`. This can be useful for gradually migrating from
-dialect conversion-based bufferization to One-Shot Bufferize. One-Shot Bufferize
-must run first in such a case, because dialect conversion-based bufferization
-generates `to_tensor` ops without the `restrict` unit attribute, which One-Shot
-Bufferize cannot analyze.
+dialects with `dialect-filter`.
 
 One-Shot Bufferize can also be called programmatically with
 [`bufferization::runOneShotBufferize`](https://github.com/llvm/llvm-project/blob/ae2764e835a26bad9774803eca0a6530df2a3e2d/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h#L167).
 Alternatively,
 [`bufferization::bufferizeOp`](https://github.com/llvm/llvm-project/blob/ae2764e835a26bad9774803eca0a6530df2a3e2d/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h#L78)
-skips the analysis and inserts a copy on every buffer write, just like the
-dialect conversion-based bufferization.
+skips the analysis and inserts a copy on every buffer write.
 
 By default, function boundaries are not bufferized. This is because there are
 currently limitations around function graph bufferization: recursive
@@ -484,259 +474,3 @@ conflict detection algorithm, interested users may want to refer to:
 * [Original design document](https://discourse.llvm.org/uploads/short-url/5kckJ3DftYwQokG252teFgw3sYa.pdf)
 * [ODM talk](https://youtu.be/TXEo59CYS9A), ([slides](https://mlir.llvm.org/OpenMeetings/2022-01-13-One-Shot-Bufferization.pdf)).
 * [LLVM Dev Meeting 2023 tutorial slides](https://m-sp.org/downloads/llvm_dev_2023.pdf)
-
-## Migrating from Dialect Conversion-based Bufferization
-
-Both dialect conversion-based bufferization and One-Shot Bufferize generate
-`to_tensor`/`to_memref` ops at the bufferization boundary (when run with
-`allow-unknown-ops`). They can be combined and run in sequence. However,
-One-Shot Bufferize must run first because it cannot analyze those boundary ops.
-To update existing code step-by-step, it may be useful to specify a dialect
-filter for One-Shot Bufferize, so that dialects can be switched over one-by-one.
-
-## Dialect Conversion-based Bufferization
-
-Disclaimer: Most dialect conversion-based bufferization has been migrated to
-One-Shot Bufferize. New users should use One-Shot Bufferize (with or without
-analysis). The following documentation is only for existing users of dialect
-conversion-based bufferization.
-
-This system is a simple application of MLIR's dialect conversion infrastructure.
-The bulk of the code related to bufferization is a set of ordinary
-`ConversionPattern`'s that dialect authors write for converting ops that operate
-on `tensor`'s to ops that operate on `memref`'s. A set of conventions and best
-practices are followed that allow these patterns to be run across multiple
-independent passes (rather than requiring a single huge atomic conversion pass),
-which makes the compilation pipelines scalable, robust, and easy to debug.
-
-This document is targeted at people looking to utilize MLIR's bufferization
-functionality, along with people who want to extend it to cover their own ops.
-
-<a name="the-talk">**NOTE:**</a> Before reading this document, please watch the
-talk "Type Conversions the Not-So-Hard-Way: MLIR's New Bufferization
-Infrastructure"
-([slides](https://drive.google.com/file/d/1FVbzCXxZzS9LBLuvpPNLWJD-XDkt54ky/view?usp=sharing),
-[recording](https://drive.google.com/file/d/1VfVajitgf8ZPnd-HRkJvaJiFLhBsluXN/view?usp=sharing)).
-That talk gives a high-level overview of the bufferization infrastructure and
-important conceptual details related to using the MLIR dialect conversion
-infrastructure.
-
-### Bufferization's place in a compilation pipeline
-
-Bufferization itself does not free any of the buffers that have been allocated,
-nor does it do anything particularly intelligent with the placement of buffers
-w.r.t. control flow. Thus, a realistic compilation pipeline will usually consist
-of:
-
-1.  Bufferization
-1.  Buffer optimizations such as `buffer-hoisting`, `buffer-loop-hoisting`, and
-    `promote-buffers-to-stack`, which do optimizations that are only exposed
-    after bufferization.
-1.  Finally, running the [ownership-based buffer deallocation](OwnershipBasedBufferDeallocation.md)
-    pass.
-
-After buffer deallocation has been completed, the program will be quite
-difficult to transform due to the presence of the deallocation ops. Thus, other
-optimizations such as linalg fusion on memrefs should be done before that stage.
-
-### General structure of the bufferization process
-
-Bufferization consists of running multiple *partial* bufferization passes,
-followed by one *finalizing* bufferization pass.
-
-There is typically one partial bufferization pass per dialect (though other
-subdivisions are possible). For example, for a dialect `X` there will typically
-be a pass `X-bufferize` that knows how to bufferize all the ops in that dialect.
-By running pass `X-bufferize` for each dialect `X` in the program, all the ops
-in the program are incrementally bufferized.
-
-Partial bufferization passes create programs where only some ops have been
-bufferized. These passes will create *materializations* (also sometimes called
-"casts") that convert between the `tensor` and `memref` type, which allows
-bridging between ops that have been bufferized and ops that have not yet been
-bufferized.
-
-Finalizing bufferizations complete the bufferization process, and guarantee that
-there are no tensors remaining in the program. This involves eliminating the
-materializations. The pass `finalizing-bufferize` provides a minimal pass that
-only eliminates materializations and issues an error if any unbufferized ops
-exist in the program.
-
-However, it is possible for a finalizing bufferization to do more than just
-eliminate materializations. By adding patterns (just as a partial bufferization
-would), it is possible for a finalizing bufferization pass to simultaneously
-bufferize ops and eliminate materializations. This has a number of disadvantages
-discussed in the talk and should generally be avoided.
-
-### Example
-
-As a concrete example, we will look at the bufferization pipeline from the
-`mlir-npcomp` reference backend
-([code](https://github.com/llvm/mlir-npcomp/blob/97d6d04d41216e73d40b89ffd79620973fc14ce3/lib/RefBackend/RefBackend.cpp#L232)).
-The code, slightly simplified and annotated, is reproduced here:
-
-```c++
-  // Partial bufferization passes.
-  pm.addPass(createTensorConstantBufferizePass());
-  pm.addNestedPass<func::FuncOp>(createTCPBufferizePass()); // Bufferizes the downstream `tcp` dialect.
-  pm.addNestedPass<func::FuncOp>(createLinalgBufferizePass());
-  pm.addNestedPass<func::FuncOp>(createTensorBufferizePass());
-  pm.addPass(createFuncBufferizePass());
-
-  // Finalizing bufferization pass.
-  pm.addNestedPass<func::FuncOp>(createFinalizingBufferizePass());
-```
-
-Looking first at the partial bufferization passes, we see that there are a
-sequence of `FuncOp` passes (which run in parallel on functions). These function
-passes are bracketed by `arith-bufferize` and `func-bufferize`, which are module
-passes (and thus serialize the parallel compilation process). These two passes
-must be module passes because they make changes to the top-level module.
-
-The bulk of the bufferization work is done by the function passes. Most of these
-passes are provided as part of the upstream MLIR distribution and bufferize
-their respective dialects (e.g. `abc-bufferize` bufferizes the `abc` dialect).
-The `tcp-bufferize` pass is an exception -- it is a partial bufferization pass
-used to bufferize the downstream `tcp` dialect, and fits in perfectly with all
-the other passes provided upstream.
-
-The last pass is the finalizing bufferization pass. The `mlir-npcomp` reference
-backend has arranged that all ops are bufferized by partial bufferizations, so
-that the upstream `finalizing-bufferize` pass can be used as the finalizing
-bufferization pass. This gives excellent diagnostics when something goes wrong
-with the bufferization process, such as due to an op that wasn't handled by any
-pattern.
-
-### How to write a partial bufferization pass
-
-The contract of a partial bufferization pass is that a subset of ops (or kinds
-of ops, customizable by a ConversionTarget) get bufferized.
-
-A partial bufferization pass is just a pass that uses the
-[dialect conversion](DialectConversion.md) framework to apply
-`ConversionPattern`s with a `tensor` to `memref` type conversion.
-
-To describe how to write such a pass, we will walk through an example, the
-`tensor-bufferize` pass
-([code](https://github.com/llvm/llvm-project/blob/bc8acf2ce8ad6e8c9b1d97b2e02d3f4ad26e1d9d/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp#L23),
-[test](https://github.com/llvm/llvm-project/blob/bc8acf2ce8ad6e8c9b1d97b2e02d3f4ad26e1d9d/mlir/test/Dialect/Tensor/bufferize.mlir#L1))
-that bufferizes the `tensor` dialect. Note that these passes have been replaced
-with a `BufferizableOpInterface`-based implementation in the meantime, so we
-have to take a looker at an older version of the code.
-
-The bulk of the code in the pass will be a set of conversion patterns, with a
-simple example being
-[BufferizeCastOp](https://github.com/llvm/llvm-project/blob/2bf6e443e54604c7818c4d1a1837f3d091023270/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp#L23)).
-
-```
-class BufferizeCastOp : public OpConversionPattern<tensor::CastOp> {
-public:
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(tensor::CastOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    auto resultType = getTypeConverter()->convertType(op.getType());
-    rewriter.replaceOpWithNewOp<MemRefCastOp>(op, resultType, adaptor.source());
-    return success();
-  }
-};
-```
-
-See [the talk](#the-talk) for more details on how to write these patterns.
-
-The
-[pass itself](https://github.com/llvm/llvm-project/blob/bc8acf2ce8ad6e8c9b1d97b2e02d3f4ad26e1d9d/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp#L57)
-is very small, and follows the basic pattern of any dialect conversion pass.
-
-```
-void mlir::populateTensorBufferizePatterns(
-    const BufferizeTypeConverter &typeConverter, RewritePatternSet &patterns) {
-  patterns.add<BufferizeCastOp, BufferizeExtractOp>(typeConverter,
-                                                    patterns.getContext());
-}
-
-struct TensorBufferizePass : public TensorBufferizeBase<TensorBufferizePass> {
-  void runOnOperation() override {
-    auto *context = &getContext();
-    BufferizeTypeConverter typeConverter;
-    RewritePatternSet patterns(context);
-    ConversionTarget target(*context);
-
-    populateTensorBufferizePatterns(typeConverter, patterns);
-    target.addIllegalOp<tensor::CastOp, tensor::ExtractOp>();
-    target.addLegalDialect<func::FuncDialect>();
-
-    if (failed(
-            applyPartialConversion(getOperation(), target, std::move(patterns))))
-      signalPassFailure();
-  }
-};
-```
-
-The pass has all the hallmarks of a dialect conversion pass that does type
-conversions: a `TypeConverter`, a `RewritePatternSet`, and a `ConversionTarget`,
-and a call to `applyPartialConversion`. Note that a function
-`populateTensorBufferizePatterns` is separated, so that power users can use the
-patterns independently, if necessary (such as to combine multiple sets of
-conversion patterns into a single conversion call, for performance).
-
-One convenient utility provided by the MLIR bufferization infrastructure is the
-`BufferizeTypeConverter`, which comes pre-loaded with the necessary conversions
-and materializations between `tensor` and `memref`.
-
-In this case, the `BufferizationOpsDialect` is marked as legal, so the
-`bufferization.to_tensor` and `bufferization.to_memref` ops, which are inserted
-automatically by the dialect conversion framework as materializations, are
-legal. There is a helper `populateBufferizeMaterializationLegality`
-([code](https://github.com/llvm/llvm-project/blob/a0b65a7bcd6065688189b3d678c42ed6af9603db/mlir/include/mlir/Transforms/Bufferize.h#L53))
-which helps with this in general.
-
-### Other partial bufferization examples
-
--   `func-bufferize`
-    ([code](https://github.com/llvm/llvm-project/blob/2f5715dc78328215d51d5664c72c632a6dac1046/mlir/lib/Dialect/Func/Transforms/FuncBufferize.cpp#L1),
-    [test](https://github.com/llvm/llvm-project/blob/2f5715dc78328215d51d5664c72c632a6dac1046/mlir/test/Dialect/Func/func-bufferize.mlir#L1))
-
-    -   Bufferizes `func`, `call`, and `BranchOpInterface` ops.
-    -   This is an example of how to bufferize ops that have multi-block
-        regions.
-    -   This is an example of a pass that is not split along dialect
-        subdivisions.
-
-### How to write a finalizing bufferization pass
-
-The contract of a finalizing bufferization pass is that all tensors are gone
-from the program.
-
-The easiest way to write a finalizing bufferize pass is to not write one at all!
-MLIR provides a pass `finalizing-bufferize` which eliminates the
-`bufferization.to_tensor` / `bufferization.to_memref` materialization ops
-inserted by partial bufferization passes and emits an error if that is not
-sufficient to remove all tensors from the program.
-
-This pass is sufficient when partial bufferization passes have bufferized all
-the ops in the program, leaving behind only the materializations. When possible,
-it is recommended to structure your pass pipeline this way, as this has the
-significant advantage that if an op does not get bufferized (due to a missing
-pattern, bug in the code, etc.), `finalizing-bufferize` will emit a nice clean
-error, and the IR seen by `finalizing-bufferize` will only contain only one
-unbufferized op.
-
-However, before the current bufferization infrastructure was put in place,
-bufferization could only be done as a single finalizing bufferization mega-pass
-that used the `populate*BufferizePatterns` functions from multiple dialects to
-simultaneously bufferize everything at once. Thus, one might see code in
-downstream projects structured this way. This structure is not recommended in
-new code. A helper, `populateEliminateBufferizeMaterializationsPatterns`
-([code](https://github.com/llvm/llvm-project/blob/a0b65a7bcd6065688189b3d678c42ed6af9603db/mlir/include/mlir/Transforms/Bufferize.h#L58))
-is available for such passes to provide patterns that eliminate
-`bufferization.to_tensor` and `bufferization.to_memref`.
-
-### Changes since [the talk](#the-talk)
-
--   `func-bufferize` was changed to be a partial conversion pass, and there is a
-    new `finalizing-bufferize` which serves as a general finalizing
-    bufferization pass.
--   Most partial bufferization passes have been reimplemented in terms of
-    `BufferizableOpInterface`. New users should use One-Shot Bufferize instead
-    of dialect conversion-based bufferization.
diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h
index b8a6f08b15981..0a515bbea3b50 100644
--- a/mlir/include/mlir-c/IR.h
+++ b/mlir/include/mlir-c/IR.h
@@ -956,6 +956,15 @@ MLIR_CAPI_EXPORTED MlirOpOperand mlirValueGetFirstUse(MlirValue value);
 MLIR_CAPI_EXPORTED void mlirValueReplaceAllUsesOfWith(MlirValue of,
                                                       MlirValue with);
 
+/// Replace all uses of 'of' value with 'with' value, updating anything in the
+/// IR that uses 'of' to use 'with' instead, except if the user is listed in
+/// 'exceptions'. The 'exceptions' parameter is an array of MlirOperation
+/// pointers with a length of 'numExceptions'.
+MLIR_CAPI_EXPORTED void
+mlirValueReplaceAllUsesExcept(MlirValue of, MlirValue with,
+                              intptr_t numExceptions,
+                              MlirOperation *exceptions);
+
 //===----------------------------------------------------------------------===//
 // OpOperand API.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index a887fd7ef1d22..4832d3e79e018 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -1061,8 +1061,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> {
 // AffineDelinearizeIndexOp
 //===----------------------------------------------------------------------===//
 
-def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index",
-    [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> {
   let summary = "delinearize an index";
   let description = [{
     The `affine.delinearize_index` operation takes a single index value and
@@ -1084,6 +1083,25 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index",
     %indices_1 = affine.apply #map1()[%linear_index]
     %indices_2 = affine.apply #map2()[%linear_index]
     ```
+
+    The basis may either contain `N` or `N-1` elements, where `N` is the number of results.
+    If there are N basis elements, the first one will not be used during computations,
+    but may be used during analysis and canonicalization to eliminate terms from
+    the `affine.delinearize_index` or to enable conclusions about the total size of
+    `%linear_index`.
+
+    If the basis is fully provided, the delinearize_index operation is said to "have
+    an outer bound". The builders assume that an `affine.delinearize_index` has
+    an outer bound by default, as this is how the operation was initially defined.
+
+    That is, the example above could also have been written
+    ```mlir
+    %0:3 = affine.delinearize_index %linear_index into (244, 244) : index, index
+    ```
+
+    Note that, due to the constraints of affine maps, all the basis elements must
+    be strictly positive. A dynamic basis element being 0 or negative causes
+    undefined behavior.
   }];
 
   let arguments = (ins Index:$linear_index,
@@ -1098,17 +1116,27 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index",
   }];
 
   let builders = [
-    OpBuilder<(ins "Value":$linear_index, "ValueRange":$basis)>,
-    OpBuilder<(ins "Value":$linear_index, "ArrayRef<OpFoldResult>":$basis)>,
-    OpBuilder<(ins "Value":$linear_index, "ArrayRef<int64_t>":$basis)>
+    OpBuilder<(ins "Value":$linear_index, "ValueRange":$dynamic_basis, "ArrayRef<int64_t>":$static_asis, CArg<"bool", "true">:$hasOuterBound)>,
+    OpBuilder<(ins "Value":$linear_index, "ValueRange":$basis, CArg<"bool", "true">:$hasOuterBound)>,
+    OpBuilder<(ins "Value":$linear_index, "ArrayRef<OpFoldResult>":$basis, CArg<"bool", "true">:$hasOuterBound)>,
+    OpBuilder<(ins "Value":$linear_index, "ArrayRef<int64_t>":$basis, CArg<"bool", "true">:$hasOuterBound)>
   ];
 
   let extraClassDeclaration = [{
+    /// Return true if the basis includes a bound on the first index input.
+    bool hasOuterBound() {
+      return getMultiIndex().size() == getStaticBasis().size();
+    }
+
     /// Returns a vector with all the static and dynamic basis values.
     SmallVector<OpFoldResult> getMixedBasis() {
       OpBuilder builder(getContext());
       return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
     }
+
+    /// Return a vector that contains the basis of the operation, removing
+    /// the outer bound if one is present.
+    SmallVector<OpFoldResult> getEffectiveBasis();
   }];
 
   let hasVerifier = 1;
@@ -1126,13 +1154,21 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     The `affine.linearize_index` operation takes a sequence of index values and a
     basis of the same length and linearizes the indices using that basis.
 
-    That is, for indices `%idx_1` through `%idx_N` and basis elements `b_1` through `b_N`,
-    it computes
+    That is, for indices `%idx_0` to `%idx_{N-1}` and basis elements `b_0`
+    (or `b_1`) up to `b_{N-1}` it computes
 
     ```
-    sum(i = 1 to N) %idx_i * product(j = i + 1 to N) B_j
+    sum(i = 0 to N-1) %idx_i * product(j = i + 1 to N-1) B_j
     ```
 
+    The basis may either have `N` or `N-1` elements, where `N` is the number of
+    inputs to linearize_index. If `N` basis elements are provided, the first one
+    is not used in computation, but may be used during analysis or
+    canonicalization as a bound on `%idx_0`.
+
+    If all `N` basis elements are provided, the linearize_index operation is said to
+    "have an outer bound".
+
     If the `disjoint` property is present, this is an optimization hint that,
     for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index,
     except that `%idx_0` may be negative to make the index as a whole negative.
@@ -1142,7 +1178,9 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     Example:
 
     ```mlir
-    %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] (2, 3, 5) : index
+    %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] by (2, 3, 5) : index
+    // Same effect
+    %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] by (3, 5) : index
     ```
 
     In the above example, `%linear_index` conceptually holds the following:
@@ -1173,12 +1211,20 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
   ];
 
   let extraClassDeclaration = [{
+    /// Return true if the basis includes a bound on the first index input.
+    bool hasOuterBound() {
+      return getMultiIndex().size() == getStaticBasis().size();
+    }
+
     /// Return a vector with all the static and dynamic basis values.
     SmallVector<OpFoldResult> getMixedBasis() {
       OpBuilder builder(getContext());
       return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
     }
 
+    /// Return a vector that contains the basis of the operation, removing
+    /// the outer bound if one is present.
+    SmallVector<OpFoldResult> getEffectiveBasis();
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index 402b6cbc0a966..25edc3970d070 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -313,17 +313,23 @@ struct DivModValue {
 DivModValue getDivMod(OpBuilder &b, Location loc, Value lhs, Value rhs);
 
 /// Generate the IR to delinearize `linearIndex` given the `basis` and return
-/// the multi-index.
+/// the multi-index. `hasOuterBound` indicates whether `basis` has an entry
+/// giving the size of the first multi-index result - if it is true, the function
+/// will return `basis.size()` values, otherwise, it will return `basis.size() +
+/// 1`.
 FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
                                                Value linearIndex,
-                                               ArrayRef<Value> basis);
+                                               ArrayRef<Value> basis,
+                                               bool hasOuterBound = true);
 
 FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
                                                Value linearIndex,
-                                               ArrayRef<OpFoldResult> basis);
+                                               ArrayRef<OpFoldResult> basis,
+                                               bool hasOuterBound = true);
 
 // Generate IR that extracts the linear index from a multi-index according to
-// a basis/shape.
+// a basis/shape. The basis may contain either `multiIndex.size()` or
+// `multiIndex.size() - 1` elements.
 OpFoldResult linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
                             ArrayRef<OpFoldResult> basis,
                             ImplicitLocOpBuilder &builder);
diff --git a/mlir/include/mlir/Dialect/Func/Transforms/Passes.h b/mlir/include/mlir/Dialect/Func/Transforms/Passes.h
index 011ad3e3d0be4..02fc9e1d93439 100644
--- a/mlir/include/mlir/Dialect/Func/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Func/Transforms/Passes.h
@@ -29,9 +29,6 @@ namespace func {
 #define GEN_PASS_DECL
 #include "mlir/Dialect/Func/Transforms/Passes.h.inc"
 
-/// Creates an instance of func bufferization pass.
-std::unique_ptr<Pass> createFuncBufferizePass();
-
 /// Pass to deduplicate functions.
 std::unique_ptr<Pass> createDuplicateFunctionEliminationPass();
 
diff --git a/mlir/include/mlir/Dialect/Func/Transforms/Passes.td b/mlir/include/mlir/Dialect/Func/Transforms/Passes.td
index 01d9f96f03963..22833946ab880 100644
--- a/mlir/include/mlir/Dialect/Func/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Func/Transforms/Passes.td
@@ -11,35 +11,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-def FuncBufferize : Pass<"func-bufferize", "ModuleOp"> {
-  let summary = "Bufferize func/call/return ops";
-  let description = [{
-    A bufferize pass that bufferizes func.func and func.call ops.
-
-    Because this pass updates func.func ops, it must be a module pass. It is
-    useful to keep this pass separate from other bufferizations so that the
-    other ones can be run at function-level in parallel.
-
-    This pass must be done atomically because it changes func op signatures,
-    which requires atomically updating calls as well throughout the entire
-    module.
-
-    This pass also changes the type of block arguments, which requires that all
-    successor arguments of predecessors be converted. This is achieved by
-    rewriting terminators based on the information provided by the
-    `BranchOpInterface`.
-    As this pass rewrites function operations, it also rewrites the
-    corresponding return operations. Other return-like operations that
-    implement the `ReturnLike` trait are not rewritten in general, as they
-    require that the corresponding parent operation is also rewritten.
-    Finally, this pass fails for unknown terminators, as we cannot decide
-    whether they need rewriting.
-  }];
-  let constructor = "mlir::func::createFuncBufferizePass()";
-  let dependentDialects = ["bufferization::BufferizationDialect",
-                           "memref::MemRefDialect"];
-}
-
 def DuplicateFunctionEliminationPass : Pass<"duplicate-function-elimination",
     "ModuleOp"> {
   let summary = "Deduplicate functions";
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index 6d7cb5ca7a7f8..d4b16a1de8edd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -14,6 +14,7 @@
 #define MLIR_DIALECT_GPU_IR_COMPILATIONINTERFACES_H
 
 #include "mlir/IR/Attributes.h"
+#include "llvm/IR/Module.h"
 
 namespace llvm {
 class IRBuilderBase;
@@ -52,7 +53,11 @@ class TargetOptions {
       StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
       StringRef cmdOptions = {},
       CompilationTarget compilationTarget = getDefaultCompilationTarget(),
-      function_ref<SymbolTable *()> getSymbolTableCallback = {});
+      function_ref<SymbolTable *()> getSymbolTableCallback = {},
+      function_ref<void(llvm::Module &)> initialLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> linkedLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> optimizedLlvmIRCallback = {},
+      function_ref<void(StringRef)> isaCallback = {});
 
   /// Returns the typeID.
   TypeID getTypeID() const;
@@ -80,6 +85,22 @@ class TargetOptions {
   /// table.
   SymbolTable *getSymbolTable() const;
 
+  /// Returns the callback invoked with the initial LLVM IR for the device
+  /// module.
+  function_ref<void(llvm::Module &)> getInitialLlvmIRCallback() const;
+
+  /// Returns the callback invoked with LLVM IR for the device module
+  /// after linking the device libraries.
+  function_ref<void(llvm::Module &)> getLinkedLlvmIRCallback() const;
+
+  /// Returns the callback invoked with LLVM IR for the device module after
+  /// LLVM optimizations but before codegen.
+  function_ref<void(llvm::Module &)> getOptimizedLlvmIRCallback() const;
+
+  /// Returns the callback invoked with the target ISA for the device,
+  /// for example PTX assembly.
+  function_ref<void(StringRef)> getISACallback() const;
+
   /// Returns the default compilation target: `CompilationTarget::Fatbin`.
   static CompilationTarget getDefaultCompilationTarget();
 
@@ -90,7 +111,11 @@ class TargetOptions {
       TypeID typeID, StringRef toolkitPath = {},
       ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
       CompilationTarget compilationTarget = getDefaultCompilationTarget(),
-      function_ref<SymbolTable *()> getSymbolTableCallback = {});
+      function_ref<SymbolTable *()> getSymbolTableCallback = {},
+      function_ref<void(llvm::Module &)> initialLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> linkedLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> optimizedLlvmIRCallback = {},
+      function_ref<void(StringRef)> isaCallback = {});
 
   /// Path to the target toolkit.
   std::string toolkitPath;
@@ -109,6 +134,21 @@ class TargetOptions {
   /// being serialized.
   function_ref<SymbolTable *()> getSymbolTableCallback;
 
+  /// Callback invoked with the initial LLVM IR for the device module.
+  function_ref<void(llvm::Module &)> initialLlvmIRCallback;
+
+  /// Callback invoked with LLVM IR for the device module after
+  /// linking the device libraries.
+  function_ref<void(llvm::Module &)> linkedLlvmIRCallback;
+
+  /// Callback invoked with LLVM IR for the device module after
+  /// LLVM optimizations but before codegen.
+  function_ref<void(llvm::Module &)> optimizedLlvmIRCallback;
+
+  /// Callback invoked with the target ISA for the device,
+  /// for example PTX assembly.
+  function_ref<void(StringRef)> isaCallback;
+
 private:
   TypeID typeID;
 };
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 6b462de144d1f..296a3c305e5bf 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -53,6 +53,18 @@ def NVVM_Dialect : Dialect {
     static StringRef getReqntidYName() { return "reqntidy"; }
     static StringRef getReqntidZName() { return "reqntidz"; }
 
+    /// Get the name of the attribute used to annotate exact CTAs required
+    /// per cluster for kernel functions.
+    static StringRef getClusterDimAttrName() { return "nvvm.cluster_dim"; }
+    /// Get the metadata names used for each cluster dimension.
+    static StringRef getClusterDimXName() { return "cluster_dim_x"; }
+    static StringRef getClusterDimYName() { return "cluster_dim_y"; }
+    static StringRef getClusterDimZName() { return "cluster_dim_z"; }
+
+    /// Get the name of the attribute used to annotate maximum number of
+    /// CTAs per cluster for kernel functions.
+    static StringRef getClusterMaxBlocksAttrName() {  return "nvvm.cluster_max_blocks"; }
+
     /// Get the name of the attribute used to annotate min CTA required
     /// per SM for kernel functions.
     static StringRef getMinctasmAttrName() { return "nvvm.minctasm"; }
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 3695708439d91..71dac3ad39b7b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -197,6 +197,21 @@ def ROCDL_BallotOp :
   let assemblyFormat = "$pred attr-dict `:` type($res)";
 }
 
+def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res", "src0"]>], 1>,
+  Arguments<(ins LLVM_Type:$src0,
+                 I32:$src1)> {
+  let results = (outs LLVM_Type:$res);
+  let summary = "Get the value in the specific lane.";
+
+  let description = [{
+    Get the value in lane `src1` from input `src0`.
+  }];
+
+  let assemblyFormat = [{
+    $src0 `,` $src1  attr-dict `:` `(` type($src0) `,` type($src1) `)` `->` type($res)
+   }];
+}
+
 //===----------------------------------------------------------------------===//
 // Thread index and Block index
 
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index a90777c82bf63..37eec6e07963b 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -622,7 +622,8 @@ def MatmulOp : LinalgStructuredBase_Op<"matmul", [
             CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
       [{
         buildMatmulOp($_builder, $_state, std::nullopt, inputs, outputs,
-          attributes, MatmulOp::getRegionBuilder());
+          attributes, MatmulOp::getRegionBuilder(),
+          MatmulOp::getDefaultIndexingMaps($_builder.getContext()));
       }]>,
       OpBuilder<
       (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
@@ -630,16 +631,8 @@ def MatmulOp : LinalgStructuredBase_Op<"matmul", [
             CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
       [{
         buildMatmulOp($_builder, $_state, resultTensorTypes,
-          inputs, outputs, attributes, MatmulOp::getRegionBuilder());
-      }]>,
-      OpBuilder<
-      (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands,
-            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
-      [{
-        $_state.addOperands(operands);
-        $_state.addAttributes(attributes);
-        $_state.addTypes(resultTensorTypes);
-        (void)$_state.addRegion();
+          inputs, outputs, attributes, MatmulOp::getRegionBuilder(),
+          MatmulOp::getDefaultIndexingMaps($_builder.getContext()));
       }]>,
       OpBuilder<
       (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
@@ -648,7 +641,8 @@ def MatmulOp : LinalgStructuredBase_Op<"matmul", [
       [{
         $_state.addAttribute("cast", cast);
         buildMatmulOp($_builder, $_state, resultTensorTypes, inputs, outputs,
-          attributes, MatmulOp::getRegionBuilder());
+          attributes, MatmulOp::getRegionBuilder(),
+          MatmulOp::getDefaultIndexingMaps($_builder.getContext()));
       }]>
 
     ];
@@ -664,7 +658,7 @@ def MatmulOp : LinalgStructuredBase_Op<"matmul", [
                                 Block &block, ArrayRef<NamedAttribute> attrs);
 
       /// Returns a list of AffineMap with the typical matmul indexing charactristic.
-      SmallVector<AffineMap> getDefaultIndexingMaps();
+      static SmallVector<AffineMap> getDefaultIndexingMaps(MLIRContext *context);
 
       /// Returns true if the given broadcast map \p bcastMap is valid for this op.
       bool isValidLhsRhsBroadcastMap(AffineMap bcastMap);
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 42057d8d0c910..e3084530bd11b 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -41,13 +41,12 @@ def ApplyEraseUnnecessaryInputsPatternsOp : Op<Transform_Dialect,
   let assemblyFormat = "attr-dict";
 }
 
-def ApplyGeneralizeTensorPackUnpackPatternsOp
-    : Op<Transform_Dialect, "apply_patterns.linalg.generalize_pack_unpack",
+def ApplyDecomposeTensorPackUnpackPatternsOp
+    : Op<Transform_Dialect, "apply_patterns.linalg.decompose_pack_unpack",
          [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
   let description = [{
-    Collect patterns to generalize tensor.pack and tensor.unpack (i.e. to
-    decompose it into e.g. tensor::PadOp, linalg::transposeOp etc). Requires
-    all outer dims to be unit.
+    Collect patterns to decompose tensor.pack and tensor.unpack into e.g.
+    tensor::PadOp, linalg::transposeOp Ops. Requires all outer dims to be unit.
   }];
 
   let assemblyFormat = "attr-dict";
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 19951f616a43b..fd682c40e4cd7 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1548,7 +1548,7 @@ struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> {
 ///     into %arg1[0, 0, 0, 0] [1, 1, 2, %tile_dim_1] [1, 1, 1, 1]
 ///     : tensor<2x?xf32> into tensor<1x1x2x?xf32>
 /// ```
-struct GeneralizeOuterUnitDimsPackOpPattern
+struct DecomposeOuterUnitDimsPackOpPattern
     : public OpRewritePattern<tensor::PackOp> {
   using OpRewritePattern<tensor::PackOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(tensor::PackOp packOp,
@@ -1558,7 +1558,7 @@ struct GeneralizeOuterUnitDimsPackOpPattern
 /// Rewrites a tensor::UnPackOp into a sequence of rank-reduced extract_slice op
 /// + transpose op + insert_slice op, where the tensor::UnPackOp has outer dims
 /// being all 1s.
-struct GeneralizeOuterUnitDimsUnPackOpPattern
+struct DecomposeOuterUnitDimsUnPackOpPattern
     : public OpRewritePattern<tensor::UnPackOp> {
   using OpRewritePattern<tensor::UnPackOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp,
@@ -1686,7 +1686,7 @@ void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
 /// Populates patterns to decompose tensor.pack and tensor.unpack Ops into e.g.
 /// tensor.pad, linalg.transpose, tensor.{insert|extract}_slice. Require all
 /// outer dims to be unit.
-void populateGeneralizePatterns(RewritePatternSet &patterns);
+void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns);
 
 /// Populates patterns to transform linalg.conv_2d_xxx operations into
 /// linalg.generic (for img2col packing) and linalg.matmul.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index f79a3eb88e4b5..156e6eb371b85 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -313,6 +313,49 @@ def SingleOp : OpenMP_Op<"single", traits = [
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// 2.8.3 Workshare Construct
+//===----------------------------------------------------------------------===//
+
+def WorkshareOp : OpenMP_Op<"workshare", traits = [
+    RecursiveMemoryEffects,
+  ], clauses = [
+    OpenMP_NowaitClause,
+  ], singleRegion = true> {
+  let summary = "workshare directive";
+  let description = [{
+    The workshare construct divides the execution of the enclosed structured
+    block into separate units of work, and causes the threads of the team to
+    share the work such that each unit is executed only once by one thread, in
+    the context of its implicit task.
+
+    This operation is used for the intermediate representation of the workshare
+    block before the work gets divided between the threads. See the flang
+    LowerWorkshare pass for details.
+  }] # clausesDescription;
+
+  let builders = [
+    OpBuilder<(ins CArg<"const WorkshareOperands &">:$clauses)>
+  ];
+}
+
+def WorkshareLoopWrapperOp : OpenMP_Op<"workshare.loop_wrapper", traits = [
+    DeclareOpInterfaceMethods<LoopWrapperInterface>, NoTerminator,
+    RecursiveMemoryEffects, SingleBlock
+  ], singleRegion = true> {
+  let summary = "contains loop nests to be parallelized by workshare";
+  let description = [{
+    This operation wraps a loop nest that is marked for dividing into units of
+    work by an encompassing omp.workshare operation.
+  }];
+
+  let builders = [
+    OpBuilder<(ins), [{ build($_builder, $_state, {}); }]>
+  ];
+  let assemblyFormat = "$region attr-dict";
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Loop Nest
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
index 291f2ef055c8a..de7be3f21f3b1 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVMemoryOps.td
@@ -74,6 +74,12 @@ def SPIRV_AccessChainOp : SPIRV_Op<"AccessChain", [Pure]> {
   let builders = [OpBuilder<(ins "Value":$basePtr, "ValueRange":$indices)>];
 
   let hasCanonicalizer = 1;
+
+  let hasCustomAssemblyFormat = 0;
+
+  let assemblyFormat = [{
+    $base_ptr `[` $indices `]` attr-dict `:` type($base_ptr) `,` type($indices) `->` type(results)
+  }];
 }
 
 // -----
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 239ce0aa8e003..5910aa3f7f2da 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -130,6 +130,11 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
                    "llvm::ArrayRef<OpFoldResult>": $offsets)>,
 
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType> ": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "llvm::ArrayRef<OpFoldResult>": $shape,
+                   "llvm::ArrayRef<OpFoldResult>": $strides)>,
+
     OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
                    "llvm::ArrayRef<OpFoldResult>": $offsets,
                    "llvm::ArrayRef<OpFoldResult>": $shape,
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index 4868dad9e1be1..68d4bf7c17e18 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -66,7 +66,7 @@ def TilingInterface : OpInterface<"TilingInterface"> {
         /*desc=*/[{
           Returns a list of iterator types that describe the number of loops.
         }],
-        /*retType=*/"SmallVector<utils::IteratorType>",
+        /*retType=*/"::mlir::SmallVector<utils::IteratorType>",
         /*methodName=*/"getLoopIteratorTypes",
         /*args=*/(ins),
         /*methodBody=*/"",
@@ -77,9 +77,9 @@ def TilingInterface : OpInterface<"TilingInterface"> {
           Returns a list of ranges that describe the loop bounds and
           step for the loops of the operation.
         }],
-        /*retTy=*/"SmallVector<Range>",
+        /*retTy=*/"::mlir::SmallVector<::mlir::Range>",
         /*methodName=*/"getIterationDomain",
-        /*args=*/(ins "OpBuilder &":$b),
+        /*args=*/(ins "::mlir::OpBuilder &":$b),
         /*methodBody=*/"",
         /*defaultImplementation=*/"return {};"
       >,
@@ -104,12 +104,12 @@ def TilingInterface : OpInterface<"TilingInterface"> {
           untiled operation, a `Value` that is the result of the tiled
           operation.
         }],
-        /*retType=*/"FailureOr<::mlir::TilingResult>",
+        /*retType=*/"::mlir::FailureOr<::mlir::TilingResult>",
         /*methodName=*/"getTiledImplementation",
         /*args=*/(ins
-            "OpBuilder &":$b,
-            "ArrayRef<OpFoldResult> ":$offsets,
-            "ArrayRef<OpFoldResult> ":$sizes),
+            "::mlir::OpBuilder &":$b,
+            "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets,
+            "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return {};
@@ -149,12 +149,12 @@ def TilingInterface : OpInterface<"TilingInterface"> {
         /*retType=*/"::llvm::LogicalResult",
         /*methodName=*/"getResultTilePosition",
         /*args=*/(ins
-          "OpBuilder &":$b,
+          "::mlir::OpBuilder &":$b,
           "unsigned":$resultNumber,
-          "ArrayRef<OpFoldResult> ":$offsets,
-          "ArrayRef<OpFoldResult> ":$sizes,
-          "SmallVector<OpFoldResult> &":$resultOffsets,
-          "SmallVector<OpFoldResult> &":$resultSizes),
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets,
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes,
+          "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultOffsets,
+          "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultSizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -187,13 +187,13 @@ def TilingInterface : OpInterface<"TilingInterface"> {
           used in the consumer to access). This is upto the caller to handle
           appropriately.
         }],
-        /*retType=*/"FailureOr<::mlir::TilingResult>",
+        /*retType=*/"::mlir::FailureOr<::mlir::TilingResult>",
         /*methodName=*/"generateResultTileValue",
         /*args=*/(ins
-          "OpBuilder &":$b,
+          "::mlir::OpBuilder &":$b,
           "unsigned":$resultNumber,
-          "ArrayRef<OpFoldResult>":$offsets,
-          "ArrayRef<OpFoldResult>":$sizes),
+          "::mlir::ArrayRef<::mlir::OpFoldResult>":$offsets,
+          "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -217,13 +217,13 @@ def TilingInterface : OpInterface<"TilingInterface"> {
           If it is illegal to fuse with a producer along the given operand for
           an operation, the implementation should return a failure.
         }],
-        /*retType=*/"FailureOr<::mlir::TilingResult>",
+        /*retType=*/"::mlir::FailureOr<::mlir::TilingResult>",
         /*methodName=*/"getTiledImplementationFromOperandTile",
         /*args=*/(ins
-          "OpBuilder &":$b,
+          "::mlir::OpBuilder &":$b,
           "unsigned":$operandNumber,
-          "ArrayRef<OpFoldResult>":$offsets,
-          "ArrayRef<OpFoldResult>":$sizes),
+          "::mlir::ArrayRef<::mlir::OpFoldResult>":$offsets,
+          "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -292,12 +292,12 @@ def TilingInterface : OpInterface<"TilingInterface"> {
         /*retType=*/"::llvm::LogicalResult",
         /*methodName=*/"getIterationDomainTileFromOperandTile",
         /*args=*/(ins
-          "OpBuilder &":$b,
+          "::mlir::OpBuilder &":$b,
           "unsigned":$operandNumber,
-          "ArrayRef<OpFoldResult> ":$offsets,
-          "ArrayRef<OpFoldResult> ":$sizes,
-          "SmallVectorImpl<OpFoldResult> &":$iterDomainOffsets,
-          "SmallVectorImpl<OpFoldResult> &":$iterDomainSizes),
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets,
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes,
+          "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainOffsets,
+          "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainSizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -327,12 +327,12 @@ def TilingInterface : OpInterface<"TilingInterface"> {
         /*retType=*/"::llvm::LogicalResult",
         /*methodName=*/"getIterationDomainTileFromResultTile",
         /*args=*/(ins
-          "OpBuilder &":$b,
+          "::mlir::OpBuilder &":$b,
           "unsigned":$resultNumber,
-          "ArrayRef<OpFoldResult> ":$offsets,
-          "ArrayRef<OpFoldResult> ":$sizes,
-          "SmallVectorImpl<OpFoldResult> &":$iterDomainOffsets,
-          "SmallVectorImpl<OpFoldResult> &":$iterDomainSizes),
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets,
+          "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes,
+          "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainOffsets,
+          "::mlir::SmallVectorImpl<::mlir::OpFoldResult> &":$iterDomainSizes),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -352,7 +352,7 @@ def TilingInterface : OpInterface<"TilingInterface"> {
         /*retType=*/"::llvm::LogicalResult",
         /*methodName=*/"generateScalarImplementation",
         /*args=*/(ins
-            "OpBuilder &":$b,
+            "::mlir::OpBuilder &":$b,
             "Location ":$loc,
             "ValueRange ":$ivs),
         /*methodBody=*/"",
@@ -377,13 +377,13 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
           operation reduction. The tensor shape is equal to operation result
           shape with new dimension for each non zero tile size.
         }],
-        /*retType=*/"FailureOr<SmallVector<Value>>",
+        /*retType=*/"::mlir::FailureOr<SmallVector<Value>>",
         /*methodName=*/"generateInitialTensorForPartialReduction",
         /*args=*/(ins
-            "OpBuilder &":$b,
+            "::mlir::OpBuilder &":$b,
             "Location":$loc,
-            "ArrayRef<OpFoldResult>":$sizes,
-            "ArrayRef<int>":$reductionDim),
+            "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes,
+            "::mlir::ArrayRef<int>":$reductionDim),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -396,15 +396,15 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
           less or equal to the tile size. This is meant to be used with
           `mergeReductions` method which will combine the partial reductions.
         }],
-        /*retType=*/"FailureOr<TilingResult>",
+        /*retType=*/"::mlir::FailureOr<TilingResult>",
         /*methodName=*/"tileToPartialReduction",
         /*args=*/(ins
-            "OpBuilder &":$b,
+            "::mlir::OpBuilder &":$b,
             "Location ":$loc,
             "ValueRange":$init,
-            "ArrayRef<OpFoldResult>":$offsets,
-            "ArrayRef<OpFoldResult>":$sizes,
-            "ArrayRef<int>":$reductionDims),
+            "::mlir::ArrayRef<::mlir::OpFoldResult>":$offsets,
+            "::mlir::ArrayRef<::mlir::OpFoldResult>":$sizes,
+            "::mlir::ArrayRef<int>":$reductionDims),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
@@ -416,13 +416,13 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
           tiled along the reduction dimensions. This will only apply the
           reduction the operation.
         }],
-        /*retType=*/"FailureOr<MergeResult>",
+        /*retType=*/"::mlir::FailureOr<MergeResult>",
         /*methodName=*/"mergeReductions",
         /*args=*/(ins
-            "OpBuilder &":$b,
+            "::mlir::OpBuilder &":$b,
             "Location ":$loc,
             "ValueRange":$partialReduce,
-            "ArrayRef<int>":$reductionDim),
+            "::mlir::ArrayRef<int>":$reductionDim),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return failure();
diff --git a/mlir/include/mlir/Target/LLVM/ModuleToObject.h b/mlir/include/mlir/Target/LLVM/ModuleToObject.h
index e40d7e9a43dd6..07fc55b41ae9c 100644
--- a/mlir/include/mlir/Target/LLVM/ModuleToObject.h
+++ b/mlir/include/mlir/Target/LLVM/ModuleToObject.h
@@ -29,8 +29,13 @@ class ModuleTranslation;
 /// operations being transformed must be translatable into LLVM IR.
 class ModuleToObject {
 public:
-  ModuleToObject(Operation &module, StringRef triple, StringRef chip,
-                 StringRef features = {}, int optLevel = 3);
+  ModuleToObject(
+      Operation &module, StringRef triple, StringRef chip,
+      StringRef features = {}, int optLevel = 3,
+      function_ref<void(llvm::Module &)> initialLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> linkedLlvmIRCallback = {},
+      function_ref<void(llvm::Module &)> optimizedLlvmIRCallback = {},
+      function_ref<void(StringRef)> isaCallback = {});
   virtual ~ModuleToObject();
 
   /// Returns the operation being serialized.
@@ -114,6 +119,21 @@ class ModuleToObject {
   /// Optimization level.
   int optLevel;
 
+  /// Callback invoked with the initial LLVM IR for the device module.
+  function_ref<void(llvm::Module &)> initialLlvmIRCallback;
+
+  /// Callback invoked with LLVM IR for the device module after
+  /// linking the device libraries.
+  function_ref<void(llvm::Module &)> linkedLlvmIRCallback;
+
+  /// Callback invoked with LLVM IR for the device module after
+  /// LLVM optimizations but before codegen.
+  function_ref<void(llvm::Module &)> optimizedLlvmIRCallback;
+
+  /// Callback invoked with the target ISA for the device,
+  /// for example PTX assembly.
+  function_ref<void(StringRef)> isaCallback;
+
 private:
   /// The TargetMachine created for the given Triple, if available.
   /// Accessible through `getOrCreateTargetMachine()`.
diff --git a/mlir/lib/Bindings/Python/DialectQuant.cpp b/mlir/lib/Bindings/Python/DialectQuant.cpp
index af9cdc7bdd2d8..9a871f2c122d1 100644
--- a/mlir/lib/Bindings/Python/DialectQuant.cpp
+++ b/mlir/lib/Bindings/Python/DialectQuant.cpp
@@ -250,6 +250,7 @@ static void populateDialectQuantSubmodule(const py::module &m) {
           double scale = mlirUniformQuantizedPerAxisTypeGetScale(type, i);
           scales.push_back(scale);
         }
+        return scales;
       },
       "The scales designate the difference between the real values "
       "corresponding to consecutive quantized values differing by 1. The ith "
@@ -265,6 +266,7 @@ static void populateDialectQuantSubmodule(const py::module &m) {
               mlirUniformQuantizedPerAxisTypeGetZeroPoint(type, i);
           zeroPoints.push_back(zeroPoint);
         }
+        return zeroPoints;
       },
       "the storage values corresponding to the real value 0 in the affine "
       "equation. The ith zero point corresponds to the ith slice in the "
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 3562ff38201dc..3e96f8c60ba7c 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -178,6 +178,12 @@ static const char kValueReplaceAllUsesWithDocstring[] =
 the IR that uses 'self' to use the other value instead.
 )";
 
+static const char kValueReplaceAllUsesExceptDocstring[] =
+    R"(Replace all uses of this value with the 'with' value, except for those
+in 'exceptions'. 'exceptions' can be either a single operation or a list of
+operations.
+)";
+
 //------------------------------------------------------------------------------
 // Utilities.
 //------------------------------------------------------------------------------
@@ -3718,6 +3724,29 @@ void mlir::python::populateIRCore(py::module &m) {
             mlirValueReplaceAllUsesOfWith(self.get(), with.get());
           },
           kValueReplaceAllUsesWithDocstring)
+      .def(
+          "replace_all_uses_except",
+          [](MlirValue self, MlirValue with, PyOperation &exception) {
+            MlirOperation exceptedUser = exception.get();
+            mlirValueReplaceAllUsesExcept(self, with, 1, &exceptedUser);
+          },
+          py::arg("with"), py::arg("exceptions"),
+          kValueReplaceAllUsesExceptDocstring)
+      .def(
+          "replace_all_uses_except",
+          [](MlirValue self, MlirValue with, py::list exceptions) {
+            // Convert Python list to a SmallVector of MlirOperations
+            llvm::SmallVector<MlirOperation> exceptionOps;
+            for (py::handle exception : exceptions) {
+              exceptionOps.push_back(exception.cast<PyOperation &>().get());
+            }
+
+            mlirValueReplaceAllUsesExcept(
+                self, with, static_cast<intptr_t>(exceptionOps.size()),
+                exceptionOps.data());
+          },
+          py::arg("with"), py::arg("exceptions"),
+          kValueReplaceAllUsesExceptDocstring)
       .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
            [](PyValue &self) { return self.maybeDownCast(); });
   PyBlockArgument::bind(m);
diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp
index e7e6b11c81b9d..24dc885404853 100644
--- a/mlir/lib/CAPI/IR/IR.cpp
+++ b/mlir/lib/CAPI/IR/IR.cpp
@@ -28,6 +28,7 @@
 #include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Parser/Parser.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/ThreadPool.h"
 
 #include <cstddef>
@@ -1009,6 +1010,20 @@ void mlirValueReplaceAllUsesOfWith(MlirValue oldValue, MlirValue newValue) {
   unwrap(oldValue).replaceAllUsesWith(unwrap(newValue));
 }
 
+void mlirValueReplaceAllUsesExcept(MlirValue oldValue, MlirValue newValue,
+                                   intptr_t numExceptions,
+                                   MlirOperation *exceptions) {
+  Value oldValueCpp = unwrap(oldValue);
+  Value newValueCpp = unwrap(newValue);
+
+  llvm::SmallPtrSet<mlir::Operation *, 4> exceptionSet;
+  for (intptr_t i = 0; i < numExceptions; ++i) {
+    exceptionSet.insert(unwrap(exceptions[i]));
+  }
+
+  oldValueCpp.replaceAllUsesExcept(newValueCpp, exceptionSet);
+}
+
 //===----------------------------------------------------------------------===//
 // OpOperand API.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e253037e0edce..aa4d3b70329fb 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -158,6 +158,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
     // perf.
     switch (op.getMode()) {
+    case gpu::ShuffleMode::DOWN:
+      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
+                                             adaptor.getOffset());
+      break;
     case gpu::ShuffleMode::XOR:
       dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                              adaptor.getOffset());
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index fc3fb6cd84204..f6271822b3b45 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVectorExtras.h"
@@ -4513,62 +4514,81 @@ LogicalResult AffineVectorStoreOp::verify() {
 // DelinearizeIndexOp
 //===----------------------------------------------------------------------===//
 
-LogicalResult AffineDelinearizeIndexOp::inferReturnTypes(
-    MLIRContext *context, std::optional<::mlir::Location> location,
-    ValueRange operands, DictionaryAttr attributes, OpaqueProperties properties,
-    RegionRange regions, SmallVectorImpl<Type> &inferredReturnTypes) {
-  AffineDelinearizeIndexOpAdaptor adaptor(operands, attributes, properties,
-                                          regions);
-  inferredReturnTypes.assign(adaptor.getStaticBasis().size(),
-                             IndexType::get(context));
-  return success();
+void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
+                                     OperationState &odsState,
+                                     Value linearIndex, ValueRange dynamicBasis,
+                                     ArrayRef<int64_t> staticBasis,
+                                     bool hasOuterBound) {
+  SmallVector<Type> returnTypes(hasOuterBound ? staticBasis.size()
+                                              : staticBasis.size() + 1,
+                                linearIndex.getType());
+  build(odsBuilder, odsState, returnTypes, linearIndex, dynamicBasis,
+        staticBasis);
 }
 
 void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
                                      OperationState &odsState,
-                                     Value linearIndex, ValueRange basis) {
+                                     Value linearIndex, ValueRange basis,
+                                     bool hasOuterBound) {
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis,
                              staticBasis);
-  build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis);
+  build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis,
+        hasOuterBound);
 }
 
 void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
                                      OperationState &odsState,
                                      Value linearIndex,
-                                     ArrayRef<OpFoldResult> basis) {
+                                     ArrayRef<OpFoldResult> basis,
+                                     bool hasOuterBound) {
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis);
-  build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis);
+  build(odsBuilder, odsState, linearIndex, dynamicBasis, staticBasis,
+        hasOuterBound);
 }
 
 void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
                                      OperationState &odsState,
-                                     Value linearIndex,
-                                     ArrayRef<int64_t> basis) {
-  build(odsBuilder, odsState, linearIndex, ValueRange{}, basis);
+                                     Value linearIndex, ArrayRef<int64_t> basis,
+                                     bool hasOuterBound) {
+  build(odsBuilder, odsState, linearIndex, ValueRange{}, basis, hasOuterBound);
 }
 
 LogicalResult AffineDelinearizeIndexOp::verify() {
-  if (getStaticBasis().empty())
-    return emitOpError("basis should not be empty");
-  if (getNumResults() != getStaticBasis().size())
-    return emitOpError("should return an index for each basis element");
-  auto dynamicMarkersCount =
-      llvm::count_if(getStaticBasis(), ShapedType::isDynamic);
+  ArrayRef<int64_t> staticBasis = getStaticBasis();
+  if (getNumResults() != staticBasis.size() &&
+      getNumResults() != staticBasis.size() + 1)
+    return emitOpError("should return an index for each basis element and up "
+                       "to one extra index");
+
+  auto dynamicMarkersCount = llvm::count_if(staticBasis, ShapedType::isDynamic);
   if (static_cast<size_t>(dynamicMarkersCount) != getDynamicBasis().size())
     return emitOpError(
         "mismatch between dynamic and static basis (kDynamic marker but no "
         "corresponding dynamic basis entry) -- this can only happen due to an "
         "incorrect fold/rewrite");
+
+  if (!llvm::all_of(staticBasis, [](int64_t v) {
+        return v > 0 || ShapedType::isDynamic(v);
+      }))
+    return emitOpError("no basis element may be statically non-positive");
+
   return success();
 }
 
 LogicalResult
 AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor,
                                SmallVectorImpl<OpFoldResult> &result) {
+  // If we won't be doing any division or modulo (no basis or the one basis
+  // element is purely advisory), simply return the input value.
+  if (getNumResults() == 1) {
+    result.push_back(getLinearIndex());
+    return success();
+  }
+
   if (adaptor.getLinearIndex() == nullptr)
     return failure();
 
@@ -4577,7 +4597,11 @@ AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor,
 
   int64_t highPart = cast<IntegerAttr>(adaptor.getLinearIndex()).getInt();
   Type attrType = getLinearIndex().getType();
-  for (int64_t modulus : llvm::reverse(getStaticBasis().drop_front())) {
+
+  ArrayRef<int64_t> staticBasis = getStaticBasis();
+  if (hasOuterBound())
+    staticBasis = staticBasis.drop_front();
+  for (int64_t modulus : llvm::reverse(staticBasis)) {
     result.push_back(IntegerAttr::get(attrType, llvm::mod(highPart, modulus)));
     highPart = llvm::divideFloorSigned(highPart, modulus);
   }
@@ -4586,6 +4610,20 @@ AffineDelinearizeIndexOp::fold(FoldAdaptor adaptor,
   return success();
 }
 
+SmallVector<OpFoldResult> AffineDelinearizeIndexOp::getEffectiveBasis() {
+  OpBuilder builder(getContext());
+  if (hasOuterBound()) {
+    if (getStaticBasis().front() == ::mlir::ShapedType::kDynamic)
+      return getMixedValues(getStaticBasis().drop_front(),
+                            getDynamicBasis().drop_front(), builder);
+
+    return getMixedValues(getStaticBasis().drop_front(), getDynamicBasis(),
+                          builder);
+  }
+
+  return getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
+}
+
 namespace {
 
 // Drops delinearization indices that correspond to unit-extent basis
@@ -4604,24 +4642,25 @@ struct DropUnitExtentBasis
       return zero.value();
     };
 
+    bool hasOuterBound = delinearizeOp.hasOuterBound();
     // Replace all indices corresponding to unit-extent basis with 0.
     // Remaining basis can be used to get a new `affine.delinearize_index` op.
-    SmallVector<OpFoldResult> newOperands;
+    SmallVector<OpFoldResult> newBasis;
     for (auto [index, basis] : llvm::enumerate(delinearizeOp.getMixedBasis())) {
       std::optional<int64_t> basisVal = getConstantIntValue(basis);
       if (basisVal && *basisVal == 1)
-        replacements[index] = getZero();
+        replacements[index + (hasOuterBound ? 0 : 1)] = getZero();
       else
-        newOperands.push_back(basis);
+        newBasis.push_back(basis);
     }
 
-    if (newOperands.size() == delinearizeOp.getStaticBasis().size())
+    if (newBasis.size() == delinearizeOp.getStaticBasis().size())
       return rewriter.notifyMatchFailure(delinearizeOp,
                                          "no unit basis elements");
 
-    if (!newOperands.empty()) {
+    if (!newBasis.empty() || !hasOuterBound) {
       auto newDelinearizeOp = rewriter.create<affine::AffineDelinearizeIndexOp>(
-          loc, delinearizeOp.getLinearIndex(), newOperands);
+          loc, delinearizeOp.getLinearIndex(), newBasis, hasOuterBound);
       int newIndex = 0;
       // Map back the new delinearized indices to the values they replace.
       for (auto &replacement : replacements) {
@@ -4636,27 +4675,6 @@ struct DropUnitExtentBasis
   }
 };
 
-/// Drop delinearization with a single basis element
-///
-/// By definition, `delinearize_index %linear into (%basis)` is
-/// `%linear floorDiv 1` (since `1` is the product of the basis elememts,
-/// ignoring the 0th one, and since there is no previous division we need
-/// to use the remainder of). Therefore, a single-element `delinearize`
-/// can be replaced by the underlying linear index.
-struct DropDelinearizeOneBasisElement
-    : public OpRewritePattern<affine::AffineDelinearizeIndexOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(affine::AffineDelinearizeIndexOp delinearizeOp,
-                                PatternRewriter &rewriter) const override {
-    if (delinearizeOp.getStaticBasis().size() != 1)
-      return rewriter.notifyMatchFailure(delinearizeOp,
-                                         "doesn't have a length-1 basis");
-    rewriter.replaceOp(delinearizeOp, delinearizeOp.getLinearIndex());
-    return success();
-  }
-};
-
 /// If a `affine.delinearize_index`'s input is a `affine.linearize_index
 /// disjoint` and the two operations have the same basis, replace the
 /// delinearizeation results with the inputs of the `affine.linearize_index`
@@ -4678,7 +4696,7 @@ struct CancelDelinearizeOfLinearizeDisjointExact
                                          "index doesn't come from linearize");
 
     if (!linearizeOp.getDisjoint() ||
-        linearizeOp.getMixedBasis() != delinearizeOp.getMixedBasis())
+        linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis())
       return rewriter.notifyMatchFailure(
           linearizeOp, "not disjoint or basis doesn't match delinearize");
 
@@ -4690,8 +4708,9 @@ struct CancelDelinearizeOfLinearizeDisjointExact
 
 void affine::AffineDelinearizeIndexOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.insert<CancelDelinearizeOfLinearizeDisjointExact,
-                  DropDelinearizeOneBasisElement, DropUnitExtentBasis>(context);
+  patterns
+      .insert<CancelDelinearizeOfLinearizeDisjointExact, DropUnitExtentBasis>(
+          context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -4728,11 +4747,11 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
 }
 
 LogicalResult AffineLinearizeIndexOp::verify() {
-  if (getStaticBasis().empty())
-    return emitOpError("basis should not be empty");
-
-  if (getMultiIndex().size() != getStaticBasis().size())
-    return emitOpError("should be passed an index for each basis element");
+  size_t numIndexes = getMultiIndex().size();
+  size_t numBasisElems = getStaticBasis().size();
+  if (numIndexes != numBasisElems && numIndexes != numBasisElems + 1)
+    return emitOpError("should be passed a basis element for each index except "
+                       "possibly the first");
 
   auto dynamicMarkersCount =
       llvm::count_if(getStaticBasis(), ShapedType::isDynamic);
@@ -4746,6 +4765,14 @@ LogicalResult AffineLinearizeIndexOp::verify() {
 }
 
 OpFoldResult AffineLinearizeIndexOp::fold(FoldAdaptor adaptor) {
+  // An empty multi-index linearizes to zero.
+  if (getMultiIndex().empty())
+    return IntegerAttr::get(getResult().getType(), 0);
+
+  // One single index linearizes to itself.
+  if (getMultiIndex().size() == 1)
+    return getMultiIndex().front();
+
   if (llvm::any_of(adaptor.getMultiIndex(),
                    [](Attribute a) { return a == nullptr; }))
     return nullptr;
@@ -4755,16 +4782,35 @@ OpFoldResult AffineLinearizeIndexOp::fold(FoldAdaptor adaptor) {
 
   int64_t result = 0;
   int64_t stride = 1;
-  for (auto [indexAttr, length] :
-       llvm::zip_equal(llvm::reverse(adaptor.getMultiIndex()),
-                       llvm::reverse(getStaticBasis()))) {
+  for (auto [length, indexAttr] :
+       llvm::zip_first(llvm::reverse(getStaticBasis()),
+                       llvm::reverse(adaptor.getMultiIndex()))) {
     result = result + cast<IntegerAttr>(indexAttr).getInt() * stride;
     stride = stride * length;
   }
+  // Handle the index element with no basis element.
+  if (!hasOuterBound())
+    result =
+        result +
+        cast<IntegerAttr>(adaptor.getMultiIndex().front()).getInt() * stride;
 
   return IntegerAttr::get(getResult().getType(), result);
 }
 
+SmallVector<OpFoldResult> AffineLinearizeIndexOp::getEffectiveBasis() {
+  OpBuilder builder(getContext());
+  if (hasOuterBound()) {
+    if (getStaticBasis().front() == ::mlir::ShapedType::kDynamic)
+      return getMixedValues(getStaticBasis().drop_front(),
+                            getDynamicBasis().drop_front(), builder);
+
+    return getMixedValues(getStaticBasis().drop_front(), getDynamicBasis(),
+                          builder);
+  }
+
+  return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
+}
+
 namespace {
 /// Rewrite `affine.linearize_index disjoint [%...a, %x, %...b] by (%...c, 1,
 /// %...d)` to `affine.linearize_index disjoint [%...a, %...b] by (%...c,
@@ -4782,14 +4828,20 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final
 
   LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    size_t numIndices = op.getMultiIndex().size();
+    ValueRange multiIndex = op.getMultiIndex();
+    size_t numIndices = multiIndex.size();
     SmallVector<Value> newIndices;
     newIndices.reserve(numIndices);
     SmallVector<OpFoldResult> newBasis;
     newBasis.reserve(numIndices);
 
+    if (!op.hasOuterBound()) {
+      newIndices.push_back(multiIndex.front());
+      multiIndex = multiIndex.drop_front();
+    }
+
     SmallVector<OpFoldResult> basis = op.getMixedBasis();
-    for (auto [index, basisElem] : llvm::zip_equal(op.getMultiIndex(), basis)) {
+    for (auto [index, basisElem] : llvm::zip_equal(multiIndex, basis)) {
       std::optional<int64_t> basisEntry = getConstantIntValue(basisElem);
       if (!basisEntry || *basisEntry != 1) {
         newIndices.push_back(index);
@@ -4818,23 +4870,6 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final
   }
 };
 
-/// Rewrite `affine.linearize_index [%%x] by (%b)`, into `%x`.
-///
-/// By definition, that operation is `affine.apply affine_map<()[s0] -> (s0)>,`
-/// which is the identity.
-struct DropLinearizeOneBasisElement final
-    : OpRewritePattern<affine::AffineLinearizeIndexOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op,
-                                PatternRewriter &rewriter) const override {
-    if (op.getStaticBasis().size() != 1 || op.getMultiIndex().size() != 1)
-      return rewriter.notifyMatchFailure(op, "doesn't have a a length-1 basis");
-    rewriter.replaceOp(op, op.getMultiIndex().front());
-    return success();
-  }
-};
-
 /// Cancel out linearize_index(delinearize_index(x, B), B).
 ///
 /// That is, rewrite
@@ -4857,10 +4892,10 @@ struct CancelLinearizeOfDelinearizeExact final
       return rewriter.notifyMatchFailure(
           linearizeOp, "last entry doesn't come from a delinearize");
 
-    if (linearizeOp.getMixedBasis() != delinearizeOp.getMixedBasis())
+    if (linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis())
       return rewriter.notifyMatchFailure(
-          linearizeOp,
-          "basis of linearize and delinearize don't match exactly");
+          linearizeOp, "basis of linearize and delinearize don't match exactly "
+                       "(excluding outer bounds)");
 
     if (delinearizeOp.getResults() != linearizeOp.getMultiIndex())
       return rewriter.notifyMatchFailure(
@@ -4891,9 +4926,12 @@ struct DropLinearizeLeadingZero final
     }
 
     SmallVector<OpFoldResult> mixedBasis = op.getMixedBasis();
+    ArrayRef<OpFoldResult> newMixedBasis = mixedBasis;
+    if (op.hasOuterBound())
+      newMixedBasis = newMixedBasis.drop_front();
+
     rewriter.replaceOpWithNewOp<affine::AffineLinearizeIndexOp>(
-        op, op.getMultiIndex().drop_front(),
-        ArrayRef<OpFoldResult>(mixedBasis).drop_front(), op.getDisjoint());
+        op, op.getMultiIndex().drop_front(), newMixedBasis, op.getDisjoint());
     return success();
   }
 };
@@ -4902,7 +4940,6 @@ struct DropLinearizeLeadingZero final
 void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<CancelLinearizeOfDelinearizeExact, DropLinearizeLeadingZero,
-               DropLinearizeOneBasisElement,
                DropLinearizeUnitComponentsIfDisjointOrZero>(context);
 }
 
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
index 1930e987a33ff..15478e0e1e3a5 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
@@ -36,8 +36,9 @@ struct LowerDelinearizeIndexOps
   using OpRewritePattern<AffineDelinearizeIndexOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineDelinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    FailureOr<SmallVector<Value>> multiIndex = delinearizeIndex(
-        rewriter, op->getLoc(), op.getLinearIndex(), op.getMixedBasis());
+    FailureOr<SmallVector<Value>> multiIndex =
+        delinearizeIndex(rewriter, op->getLoc(), op.getLinearIndex(),
+                         op.getEffectiveBasis(), /*hasOuterBound=*/false);
     if (failed(multiIndex))
       return failure();
     rewriter.replaceOp(op, *multiIndex);
@@ -51,6 +52,12 @@ struct LowerLinearizeIndexOps final : OpRewritePattern<AffineLinearizeIndexOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineLinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
+    // Should be folded away, included here for safety.
+    if (op.getMultiIndex().empty()) {
+      rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
+      return success();
+    }
+
     SmallVector<OpFoldResult> multiIndex =
         getAsOpFoldResult(op.getMultiIndex());
     OpFoldResult linearIndex =
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index c1e2f78a7213e..6bab289859e87 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -1951,11 +1951,14 @@ static FailureOr<OpFoldResult> composedAffineMultiply(OpBuilder &b,
 
 FailureOr<SmallVector<Value>>
 mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex,
-                               ArrayRef<Value> basis) {
+                               ArrayRef<Value> basis, bool hasOuterBound) {
+  if (hasOuterBound)
+    basis = basis.drop_front();
+
   // Note: the divisors are backwards due to the scan.
   SmallVector<Value> divisors;
   OpFoldResult basisProd = b.getIndexAttr(1);
-  for (OpFoldResult basisElem : llvm::reverse(basis.drop_front())) {
+  for (OpFoldResult basisElem : llvm::reverse(basis)) {
     FailureOr<OpFoldResult> nextProd =
         composedAffineMultiply(b, loc, basisElem, basisProd);
     if (failed(nextProd))
@@ -1978,11 +1981,15 @@ mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex,
 
 FailureOr<SmallVector<Value>>
 mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex,
-                               ArrayRef<OpFoldResult> basis) {
+                               ArrayRef<OpFoldResult> basis,
+                               bool hasOuterBound) {
+  if (hasOuterBound)
+    basis = basis.drop_front();
+
   // Note: the divisors are backwards due to the scan.
   SmallVector<Value> divisors;
   OpFoldResult basisProd = b.getIndexAttr(1);
-  for (OpFoldResult basisElem : llvm::reverse(basis.drop_front())) {
+  for (OpFoldResult basisElem : llvm::reverse(basis)) {
     FailureOr<OpFoldResult> nextProd =
         composedAffineMultiply(b, loc, basisElem, basisProd);
     if (failed(nextProd))
@@ -2012,8 +2019,15 @@ OpFoldResult mlir::affine::linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
 OpFoldResult mlir::affine::linearizeIndex(OpBuilder &builder, Location loc,
                                           ArrayRef<OpFoldResult> multiIndex,
                                           ArrayRef<OpFoldResult> basis) {
-  assert(multiIndex.size() == basis.size());
+  assert(multiIndex.size() == basis.size() ||
+         multiIndex.size() == basis.size() + 1);
   SmallVector<AffineExpr> basisAffine;
+
+  // Add a fake initial size in order to make the later index linearization
+  // computations line up if an outer bound is not provided.
+  if (multiIndex.size() == basis.size() + 1)
+    basisAffine.push_back(getAffineConstantExpr(1, builder.getContext()));
+
   for (size_t i = 0; i < basis.size(); ++i) {
     basisAffine.push_back(getAffineSymbolExpr(i, builder.getContext()));
   }
diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
index 912853871b7f8..6149b35befe7d 100644
--- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
@@ -19,6 +19,8 @@ add_mlir_dialect_library(MLIRArithTransforms
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRArithDialect
+  MLIRBufferizationDialect
+  MLIRBufferizationTransforms
   MLIRFuncDialect
   MLIRFuncTransforms
   MLIRInferIntRangeInterface
diff --git a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt
index a4d1a8f52d78a..cceb452569a58 100644
--- a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt
@@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRFuncTransforms
   AnnotateFunctionType.cpp
   DecomposeCallGraphTypes.cpp
   DuplicateFunctionElimination.cpp
-  FuncBufferize.cpp
   FuncConversions.cpp
   OneToNFuncConversions.cpp
 
@@ -13,8 +12,6 @@ add_mlir_dialect_library(MLIRFuncTransforms
   MLIRFuncTransformsIncGen
 
   LINK_LIBS PUBLIC
-  MLIRBufferizationDialect
-  MLIRBufferizationTransforms
   MLIRFuncDialect
   MLIRIR
   MLIRMemRefDialect
diff --git a/mlir/lib/Dialect/Func/Transforms/FuncBufferize.cpp b/mlir/lib/Dialect/Func/Transforms/FuncBufferize.cpp
deleted file mode 100644
index 5f4fed8e4d491..0000000000000
--- a/mlir/lib/Dialect/Func/Transforms/FuncBufferize.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//===- Bufferize.cpp - Bufferization for func ops -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements bufferization of func.func's and func.call's.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Func/Transforms/Passes.h"
-
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_FUNCBUFFERIZE
-#include "mlir/Dialect/Func/Transforms/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace mlir::func;
-
-namespace {
-struct FuncBufferizePass : public impl::FuncBufferizeBase<FuncBufferizePass> {
-  using FuncBufferizeBase<FuncBufferizePass>::FuncBufferizeBase;
-  void runOnOperation() override {
-    auto module = getOperation();
-    auto *context = &getContext();
-
-    bufferization::BufferizeTypeConverter typeConverter;
-    RewritePatternSet patterns(context);
-    ConversionTarget target(*context);
-
-    populateFunctionOpInterfaceTypeConversionPattern<FuncOp>(patterns,
-                                                             typeConverter);
-    target.addDynamicallyLegalOp<FuncOp>([&](FuncOp op) {
-      return typeConverter.isSignatureLegal(op.getFunctionType()) &&
-             typeConverter.isLegal(&op.getBody());
-    });
-    populateCallOpTypeConversionPattern(patterns, typeConverter);
-    target.addDynamicallyLegalOp<CallOp>(
-        [&](CallOp op) { return typeConverter.isLegal(op); });
-
-    populateBranchOpInterfaceTypeConversionPattern(patterns, typeConverter);
-    populateReturnOpTypeConversionPattern(patterns, typeConverter);
-    target.addLegalOp<ModuleOp, bufferization::ToTensorOp,
-                      bufferization::ToMemrefOp>();
-
-    target.markUnknownOpDynamicallyLegal([&](Operation *op) {
-      return isNotBranchOpInterfaceOrReturnLikeOp(op) ||
-             isLegalForBranchOpInterfaceTypeConversionPattern(op,
-                                                              typeConverter) ||
-             isLegalForReturnOpTypeConversionPattern(op, typeConverter);
-    });
-
-    if (failed(applyFullConversion(module, target, std::move(patterns))))
-      signalPassFailure();
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::func::createFuncBufferizePass() {
-  return std::make_unique<FuncBufferizePass>();
-}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 956877497d933..d62ea72dcea2f 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2302,17 +2302,31 @@ KernelMetadataAttr KernelTableAttr::lookup(StringAttr key) const {
 TargetOptions::TargetOptions(
     StringRef toolkitPath, ArrayRef<std::string> linkFiles,
     StringRef cmdOptions, CompilationTarget compilationTarget,
-    function_ref<SymbolTable *()> getSymbolTableCallback)
+    function_ref<SymbolTable *()> getSymbolTableCallback,
+    function_ref<void(llvm::Module &)> initialLlvmIRCallback,
+    function_ref<void(llvm::Module &)> linkedLlvmIRCallback,
+    function_ref<void(llvm::Module &)> optimizedLlvmIRCallback,
+    function_ref<void(StringRef)> isaCallback)
     : TargetOptions(TypeID::get<TargetOptions>(), toolkitPath, linkFiles,
-                    cmdOptions, compilationTarget, getSymbolTableCallback) {}
+                    cmdOptions, compilationTarget, getSymbolTableCallback,
+                    initialLlvmIRCallback, linkedLlvmIRCallback,
+                    optimizedLlvmIRCallback, isaCallback) {}
 
 TargetOptions::TargetOptions(
     TypeID typeID, StringRef toolkitPath, ArrayRef<std::string> linkFiles,
     StringRef cmdOptions, CompilationTarget compilationTarget,
-    function_ref<SymbolTable *()> getSymbolTableCallback)
+    function_ref<SymbolTable *()> getSymbolTableCallback,
+    function_ref<void(llvm::Module &)> initialLlvmIRCallback,
+    function_ref<void(llvm::Module &)> linkedLlvmIRCallback,
+    function_ref<void(llvm::Module &)> optimizedLlvmIRCallback,
+    function_ref<void(StringRef)> isaCallback)
     : toolkitPath(toolkitPath.str()), linkFiles(linkFiles),
       cmdOptions(cmdOptions.str()), compilationTarget(compilationTarget),
-      getSymbolTableCallback(getSymbolTableCallback), typeID(typeID) {}
+      getSymbolTableCallback(getSymbolTableCallback),
+      initialLlvmIRCallback(initialLlvmIRCallback),
+      linkedLlvmIRCallback(linkedLlvmIRCallback),
+      optimizedLlvmIRCallback(optimizedLlvmIRCallback),
+      isaCallback(isaCallback), typeID(typeID) {}
 
 TypeID TargetOptions::getTypeID() const { return typeID; }
 
@@ -2326,6 +2340,25 @@ SymbolTable *TargetOptions::getSymbolTable() const {
   return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
 }
 
+function_ref<void(llvm::Module &)>
+TargetOptions::getInitialLlvmIRCallback() const {
+  return initialLlvmIRCallback;
+}
+
+function_ref<void(llvm::Module &)>
+TargetOptions::getLinkedLlvmIRCallback() const {
+  return linkedLlvmIRCallback;
+}
+
+function_ref<void(llvm::Module &)>
+TargetOptions::getOptimizedLlvmIRCallback() const {
+  return optimizedLlvmIRCallback;
+}
+
+function_ref<void(StringRef)> TargetOptions::getISACallback() const {
+  return isaCallback;
+}
+
 CompilationTarget TargetOptions::getCompilationTarget() const {
   return compilationTarget;
 }
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index d28194d5c0029..ca04af0b060b4 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1126,18 +1126,22 @@ LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op,
                              << "' attribute attached to unexpected op";
     }
   }
-  // If maxntid and reqntid exist, it must be an array with max 3 dim
+  // If maxntid / reqntid / cluster_dim exist, it must be an array with max 3
+  // dim
   if (attrName == NVVMDialect::getMaxntidAttrName() ||
-      attrName == NVVMDialect::getReqntidAttrName()) {
+      attrName == NVVMDialect::getReqntidAttrName() ||
+      attrName == NVVMDialect::getClusterDimAttrName()) {
     auto values = llvm::dyn_cast<DenseI32ArrayAttr>(attr.getValue());
     if (!values || values.empty() || values.size() > 3)
       return op->emitError()
              << "'" << attrName
              << "' attribute must be integer array with maximum 3 index";
   }
-  // If minctasm and maxnreg exist, it must be an integer attribute
+  // If minctasm / maxnreg / cluster_max_blocks exist, it must be an integer
+  // attribute
   if (attrName == NVVMDialect::getMinctasmAttrName() ||
-      attrName == NVVMDialect::getMaxnregAttrName()) {
+      attrName == NVVMDialect::getMaxnregAttrName() ||
+      attrName == NVVMDialect::getClusterMaxBlocksAttrName()) {
     if (!llvm::dyn_cast<IntegerAttr>(attr.getValue()))
       return op->emitError()
              << "'" << attrName << "' attribute must be integer constant";
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index dee8a4e27e6b2..26d9d2b091750 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -155,27 +155,6 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region &region,
   // iterator_types is an auto-generated method.
 }
 
-/// Helper to create a typical indexing map for MatmulOp. Returns a list of
-/// AffineMap.
-static SmallVector<AffineMap, 3>
-getDefaultIndexingMapsForMatmul(MLIRContext *context) {
-  AffineExpr d0, d1, d2;
-  SmallVector<AffineMap, 3> indexingMaps;
-  bindDims(context, d0, d1, d2);
-  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context));
-  indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context));
-  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context));
-  return indexingMaps;
-}
-
-/// Wrapper to return the typical indexing map array attribute for MatmulOp.
-static SmallVector<Attribute>
-getDefaultMatmulIndexingMapAttr(MLIRContext *context) {
-  return llvm::map_to_vector(
-      getDefaultIndexingMapsForMatmul(context),
-      [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
-}
-
 /// Creates a structured operation given `inputs`, `outputs`, and `attributes`.
 /// The result types are derived automatically if `resultTensorTypes` is none.
 /// The body of the operation is filled using `regionBuilder`. All ods-gen
@@ -208,24 +187,18 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state,
                          state.attributes.getAttrs(), regionBuilder);
 }
 
-static void
-buildMatmulOp(OpBuilder &b, OperationState &state,
-              std::optional<TypeRange> resultTensorTypes, ValueRange inputs,
-              ValueRange outputs, ArrayRef<NamedAttribute> attributes,
-              RegionBuilderFn regionBuilder,
-              std::optional<ArrayRef<AffineMap>> indexingMaps = std::nullopt) {
-  // Initialize indexingMaps, for MatmulOp.
+static void buildMatmulOp(OpBuilder &b, OperationState &state,
+                          std::optional<TypeRange> resultTensorTypes,
+                          ValueRange inputs, ValueRange outputs,
+                          ArrayRef<NamedAttribute> attributes,
+                          RegionBuilderFn regionBuilder,
+                          ArrayRef<AffineMap> indexingMaps) {
+  // Build the indexing_maps attribute from the caller-supplied indexingMaps.
   SmallVector<Attribute, 3> indexingMapsAttrVal;
-  if (indexingMaps.has_value()) {
-    for (mlir::AffineMap map : *indexingMaps) {
-      // Convert each AffineMap to an AffineMapAttr
-      indexingMapsAttrVal.push_back(AffineMapAttr::get(map));
-    }
-    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
-  } else {
-    indexingMapsAttrVal = getDefaultMatmulIndexingMapAttr(b.getContext());
-    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
-  }
+  indexingMapsAttrVal = llvm::map_to_vector(
+      indexingMaps,
+      [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
+  state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }
@@ -3457,7 +3430,7 @@ static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp,
                                                   unsigned opIndex) {
   SmallVector<AffineMap, 3> opIndexingMaps = matmulOp.getIndexingMapsArray();
   SmallVector<AffineMap, 3> defaultIndexingMaps =
-      matmulOp.getDefaultIndexingMaps();
+      matmulOp.getDefaultIndexingMaps(matmulOp->getContext());
 
   auto opIndexingMap = opIndexingMaps[opIndex];
   auto defaultIndexingMap = defaultIndexingMaps[opIndex];
@@ -3484,6 +3457,17 @@ namespace linalg {
 // MatMulOp
 //===----------------------------------------------------------------------===//
 
+/// Returns a list of AffineMap with the typical matmul indexing characteristic.
+SmallVector<AffineMap> MatmulOp::getDefaultIndexingMaps(MLIRContext *context) {
+  AffineExpr d0, d1, d2;
+  SmallVector<AffineMap> indexingMaps;
+  bindDims(context, d0, d1, d2);
+  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context));
+  indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context));
+  indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context));
+  return indexingMaps;
+}
+
 SmallVector<utils::IteratorType> MatmulOp::getIteratorTypesArray() {
   return SmallVector<utils::IteratorType>{utils::IteratorType::parallel,
                                           utils::IteratorType::parallel,
@@ -3501,7 +3485,8 @@ bool MatmulOp::hasDynamicIndexingMaps() { return true; }
 /// Check if the op has broadcast and/or transpose semantic. Returns true if
 /// the user defined indexing maps are not equal to default map.
 bool MatmulOp::hasUserDefinedMaps() {
-  SmallVector<AffineMap, 3> defaultMaps = getDefaultIndexingMaps();
+  SmallVector<AffineMap, 3> defaultMaps =
+      getDefaultIndexingMaps(this->getContext());
   SmallVector<AffineMap, 3> explicitMaps = getIndexingMapsArray();
   return defaultMaps != explicitMaps;
 }
@@ -3535,13 +3520,6 @@ void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block,
   helper.yieldOutputs(yields);
 }
 
-/// Returns a list of AffineMap with the typical matmul indexing
-/// charactristic.
-SmallVector<AffineMap> MatmulOp::getDefaultIndexingMaps() {
-  MLIRContext *context = this->getContext();
-  return getDefaultIndexingMapsForMatmul(context);
-}
-
 /// Returns true if the given broadcast map \p bcastMap is valid for this op.
 bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) {
   assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr.");
@@ -3578,7 +3556,9 @@ ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) {
   }
   // Initialize indexingMaps, if not supplied explicitly.
   if (indexingMapsAttr.empty()) {
-    indexingMapsAttr = getDefaultMatmulIndexingMapAttr(result.getContext());
+    indexingMapsAttr = llvm::map_to_vector(
+        MatmulOp::getDefaultIndexingMaps(parser.getContext()),
+        [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
   }
   result.addAttribute("indexing_maps",
                       parser.getBuilder().getArrayAttr(indexingMapsAttr));
@@ -3592,8 +3572,9 @@ void MatmulOp::print(OpAsmPrinter &p) {
   printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(),
                          elidedAttrs);
 
-  SmallVector<Attribute, 3> indexingMaps =
-      getDefaultMatmulIndexingMapAttr(getContext());
+  SmallVector<Attribute, 3> indexingMaps = llvm::map_to_vector(
+      MatmulOp::getDefaultIndexingMaps(getContext()),
+      [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
   if (!llvm::equal(getIndexingMaps(), indexingMaps)) {
     p << " indexing_maps = [";
     llvm::interleaveComma(getIndexingMaps(), p,
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index a00c609779c3a..ada80deacfdbf 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -229,9 +229,9 @@ void transform::ApplyEraseUnnecessaryInputsPatternsOp::populatePatterns(
   linalg::populateEraseUnnecessaryInputsPatterns(patterns);
 }
 
-void transform::ApplyGeneralizeTensorPackUnpackPatternsOp::populatePatterns(
+void transform::ApplyDecomposeTensorPackUnpackPatternsOp::populatePatterns(
     RewritePatternSet &patterns) {
-  linalg::populateGeneralizePatterns(patterns);
+  linalg::populateDecomposePackUnpackPatterns(patterns);
 }
 
 void transform::ApplyFoldUnitExtentDimsViaReshapesPatternsOp::populatePatterns(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index c9eac66367559..d92543d726462 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1138,7 +1138,7 @@ getPackUnpackRankReducedPerm(ArrayRef<int64_t> shape,
   return perm;
 }
 
-LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
+LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
     tensor::PackOp packOp, PatternRewriter &rewriter) const {
   // TODO: support the case that outer dimensions are not all 1s. A
   // tensor.expand_shape will be generated in this case.
@@ -1239,7 +1239,7 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
   return success();
 }
 
-LogicalResult GeneralizeOuterUnitDimsUnPackOpPattern::matchAndRewrite(
+LogicalResult DecomposeOuterUnitDimsUnPackOpPattern::matchAndRewrite(
     tensor::UnPackOp unpackOp, PatternRewriter &rewriter) const {
   int64_t srcRank = unpackOp.getSourceRank();
   int64_t destRank = unpackOp.getDestRank();
@@ -1619,7 +1619,7 @@ void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
       patterns.getContext(), benefit);
 }
 
-void linalg::populateGeneralizePatterns(RewritePatternSet &patterns) {
+void linalg::populateDecomposePackUnpackPatterns(RewritePatternSet &patterns) {
   // TODO: Add and test patterns for tensor.unpack
-  patterns.add<GeneralizeOuterUnitDimsPackOpPattern>(patterns.getContext());
+  patterns.add<DecomposeOuterUnitDimsPackOpPattern>(patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 19e0fa30a7571..94e71e089d4b1 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1897,6 +1897,27 @@ LogicalResult SingleOp::verify() {
                                   getCopyprivateSyms());
 }
 
+//===----------------------------------------------------------------------===//
+// WorkshareOp
+//===----------------------------------------------------------------------===//
+
+void WorkshareOp::build(OpBuilder &builder, OperationState &state,
+                        const WorkshareOperands &clauses) {
+  WorkshareOp::build(builder, state, clauses.nowait);
+}
+
+//===----------------------------------------------------------------------===//
+// WorkshareLoopWrapperOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult WorkshareLoopWrapperOp::verify() {
+  if (!(*this)->getParentOfType<WorkshareOp>())
+    return emitError() << "must be nested in an omp.workshare";
+  if (getNestedWrapper())
+    return emitError() << "cannot be composite";
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // LoopWrapperInterface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp b/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp
index c4c7ff722175d..154e955d6057a 100644
--- a/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp
@@ -320,62 +320,12 @@ void AccessChainOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, type, basePtr, indices);
 }
 
-ParseResult AccessChainOp::parse(OpAsmParser &parser, OperationState &result) {
-  OpAsmParser::UnresolvedOperand ptrInfo;
-  SmallVector<OpAsmParser::UnresolvedOperand, 4> indicesInfo;
-  Type type;
-  auto loc = parser.getCurrentLocation();
-  SmallVector<Type, 4> indicesTypes;
-
-  if (parser.parseOperand(ptrInfo) ||
-      parser.parseOperandList(indicesInfo, OpAsmParser::Delimiter::Square) ||
-      parser.parseColonType(type) ||
-      parser.resolveOperand(ptrInfo, type, result.operands)) {
-    return failure();
-  }
-
-  // Check that the provided indices list is not empty before parsing their
-  // type list.
-  if (indicesInfo.empty()) {
-    return mlir::emitError(result.location,
-                           "'spirv.AccessChain' op expected at "
-                           "least one index ");
-  }
-
-  if (parser.parseComma() || parser.parseTypeList(indicesTypes))
-    return failure();
-
-  // Check that the indices types list is not empty and that it has a one-to-one
-  // mapping to the provided indices.
-  if (indicesTypes.size() != indicesInfo.size()) {
-    return mlir::emitError(
-        result.location, "'spirv.AccessChain' op indices types' count must be "
-                         "equal to indices info count");
-  }
-
-  if (parser.resolveOperands(indicesInfo, indicesTypes, loc, result.operands))
-    return failure();
-
-  auto resultType = getElementPtrType(
-      type, llvm::ArrayRef(result.operands).drop_front(), result.location);
-  if (!resultType) {
-    return failure();
-  }
-
-  result.addTypes(resultType);
-  return success();
-}
-
 template <typename Op>
 static void printAccessChain(Op op, ValueRange indices, OpAsmPrinter &printer) {
   printer << ' ' << op.getBasePtr() << '[' << indices
           << "] : " << op.getBasePtr().getType() << ", " << indices.getTypes();
 }
 
-void spirv::AccessChainOp::print(OpAsmPrinter &printer) {
-  printAccessChain(*this, getIndices(), printer);
-}
-
 template <typename Op>
 static LogicalResult verifyAccessChain(Op accessChainOp, ValueRange indices) {
   auto resultType = getElementPtrType(accessChainOp.getBasePtr().getType(),
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index bf7b3f9bec558..25fca49cb0154 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -646,10 +646,11 @@ class SparseLvlOpConverter : public OpConversionPattern<LvlOp> {
   matchAndRewrite(LvlOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     std::optional<int64_t> lvl = op.getConstantLvlIndex();
-    if (!lvl || !getSparseTensorEncoding(adaptor.getSource().getType()))
+    RankedTensorType srcType = op.getSource().getType();
+    if (!lvl || !getSparseTensorEncoding(srcType))
       return failure();
 
-    auto desc = getDescriptorFromTensorTuple(adaptor.getSource());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getSource(), srcType);
     auto sz = desc.getLvlSize(rewriter, op.getLoc(), *lvl);
 
     rewriter.replaceOp(op, sz);
@@ -675,8 +676,9 @@ struct SparseReorderCOOConverter : public OpConversionPattern<ReorderCOOOp> {
     assert(dstStt.hasSameDimToLvl(srcStt));
 
     // We don't need a mutable descriptor here as we perform sorting in-place.
-    auto nnz = genValMemSize(rewriter, op.getLoc(), adaptor.getInputCoo());
-    auto desc = getDescriptorFromTensorTuple(adaptor.getInputCoo());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getInputCoo(),
+                                             op.getInputCoo().getType());
+    auto nnz = desc.getValMemSize(rewriter, op.getLoc());
     auto crd = desc.getAOSMemRef();
     auto val = desc.getValMemRef();
 
@@ -704,7 +706,8 @@ class SparseSliceGetterOpConverter : public OpConversionPattern<Op> {
   matchAndRewrite(Op op, typename Op::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Simply lowers to specifer.get <field> operation.
-    auto desc = getDescriptorFromTensorTuple(adaptor.getSlice());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getSlice(),
+                                             op.getSlice().getType());
     auto v = desc.getSpecifierField(rewriter, op.getLoc(), kind,
                                     op.getDim().getZExtValue());
 
@@ -762,7 +765,8 @@ class SparseTensorAllocConverter
     Location loc = op.getLoc();
     // Deal with copy.
     if (op.getCopy()) {
-      auto desc = getDescriptorFromTensorTuple(adaptor.getCopy());
+      auto desc = getDescriptorFromTensorTuple(
+          adaptor.getCopy(), cast<RankedTensorType>(op.getCopy().getType()));
       SmallVector<Value> fields;
       fields.reserve(desc.getNumFields());
       // Memcpy on memref fields.
@@ -868,7 +872,9 @@ class SparseTensorDeallocConverter
     if (createDeallocs) {
       // Replace the sparse tensor deallocation with field deallocations.
       Location loc = op.getLoc();
-      auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+      auto desc = getDescriptorFromTensorTuple(
+          adaptor.getTensor(),
+          cast<RankedTensorType>(op.getTensor().getType()));
       for (auto input : desc.getMemRefFields())
         // Deallocate every buffer used to store the sparse tensor handler.
         rewriter.create<memref::DeallocOp>(loc, input);
@@ -889,7 +895,8 @@ class SparseTensorLoadConverter : public OpConversionPattern<LoadOp> {
   matchAndRewrite(LoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Prepare descriptor.
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     // Generate optional insertion finalization code.
     if (op.getHasInserts())
       genEndInsert(rewriter, op.getLoc(), desc);
@@ -909,7 +916,8 @@ class SparseExpandConverter : public OpConversionPattern<ExpandOp> {
     if (!getSparseTensorEncoding(op.getTensor().getType()))
       return failure();
     Location loc = op->getLoc();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     const auto srcType = getSparseTensorType(op.getTensor());
     Type eltType = srcType.getElementType();
     Type boolType = rewriter.getIntegerType(1);
@@ -959,7 +967,8 @@ class SparseCompressConverter : public OpConversionPattern<CompressOp> {
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     SmallVector<Value> fields;
-    auto desc = getMutDescriptorFromTensorTuple(adaptor.getTensor(), fields);
+    auto desc = getMutDescriptorFromTensorTuple(adaptor.getTensor(), fields,
+                                                op.getTensor().getType());
     Value values = adaptor.getValues();
     Value filled = adaptor.getFilled();
     Value added = adaptor.getAdded();
@@ -1032,7 +1041,8 @@ class SparseInsertConverter : public OpConversionPattern<tensor::InsertOp> {
     assert(stt.isIdentity() && "Run reinterpret-map before conversion.");
 
     Location loc = op.getLoc();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getDest());
+    auto desc =
+        getDescriptorFromTensorTuple(adaptor.getDest(), op.getDest().getType());
     TypeRange flatSpTensorTps = desc.getFields().getTypes();
     SmallVector<Value> params = llvm::to_vector(desc.getFields());
     params.append(adaptor.getIndices().begin(), adaptor.getIndices().end());
@@ -1059,7 +1069,8 @@ class SparseToPositionsConverter : public OpConversionPattern<ToPositionsOp> {
     // of this operation truly observe size, not capacity!
     Location loc = op.getLoc();
     Level lvl = op.getLevel();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     auto mem = desc.getPosMemRef(lvl);
     auto size = desc.getPosMemSize(rewriter, loc, lvl);
     rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
@@ -1081,7 +1092,8 @@ class SparseToCoordinatesConverter
     // of this operation truly observe size, not capacity!
     Location loc = op.getLoc();
     Level lvl = op.getLevel();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     auto mem = desc.getCrdMemRefOrView(rewriter, loc, lvl);
     if (lvl < getSparseTensorType(op.getTensor()).getAoSCOOStart()) {
       auto size = desc.getCrdMemSize(rewriter, loc, lvl);
@@ -1106,7 +1118,8 @@ class SparseToCoordinatesBufferConverter
     // of this operation truly observe size, not capacity!
     Location loc = op.getLoc();
     Level lvl = getSparseTensorType(op.getTensor()).getAoSCOOStart();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     auto mem = desc.getAOSMemRef();
     auto size = desc.getCrdMemSize(rewriter, loc, lvl);
     rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
@@ -1126,7 +1139,8 @@ class SparseToValuesConverter : public OpConversionPattern<ToValuesOp> {
     // The view is restricted to the actual size to ensure clients
     // of this operation truly observe size, not capacity!
     Location loc = op.getLoc();
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     auto mem = desc.getValMemRef();
     auto size = desc.getValMemSize(rewriter, loc);
     rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
@@ -1172,7 +1186,8 @@ class SparseConvertConverter : public OpConversionPattern<ConvertOp> {
     //   else:
     //     dst = memref.copy(src)
     Location loc = op.getLoc();
-    auto srcDesc = getDescriptorFromTensorTuple(adaptor.getSource());
+    auto srcDesc = getDescriptorFromTensorTuple(adaptor.getSource(),
+                                                op.getSource().getType());
     SmallVector<Value> fields;
     foreachFieldAndTypeInSparseTensor(
         SparseTensorType(cast<RankedTensorType>(op.getResult().getType())),
@@ -1236,7 +1251,8 @@ class SparseExtractSliceConverter
     assert(srcEnc.withoutDimSlices() == dstEnc.withoutDimSlices());
 
     SmallVector<Value> fields;
-    auto desc = getMutDescriptorFromTensorTuple(adaptor.getSource(), fields);
+    auto desc = getMutDescriptorFromTensorTuple(adaptor.getSource(), fields,
+                                                op.getSource().getType());
 
     auto newSpec = rewriter.create<StorageSpecifierInitOp>(
         loc, StorageSpecifierType::get(ctx, dstEnc), desc.getSpecifier());
@@ -1285,8 +1301,9 @@ class SparseNumberOfEntriesConverter
     // Query memSizes for the actually stored values.
     // FIXME: the nse value computed in this way might be wrong when there is
     // any "loose_compressed" level.
-    rewriter.replaceOp(
-        op, genValMemSize(rewriter, op.getLoc(), adaptor.getTensor()));
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
+    rewriter.replaceOp(op, desc.getValMemSize(rewriter, op.getLoc()));
     return success();
   }
 };
@@ -1415,7 +1432,8 @@ struct SparseDisassembleOpConverter
   LogicalResult
   matchAndRewrite(DisassembleOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getTensor(),
+                                             op.getTensor().getType());
     Location loc = op.getLoc();
     SmallVector<Value> retMem;
     SmallVector<Value> retLen;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
index de553a5f9bf08..f92382472b478 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
@@ -554,11 +554,6 @@ sparse_tensor::genToMemref(OpBuilder &builder, Location loc, Value tensor) {
       .getResult();
 }
 
-Value sparse_tensor::genValMemSize(OpBuilder &builder, Location loc,
-                                   Value tensor) {
-  return getDescriptorFromTensorTuple(tensor).getValMemSize(builder, loc);
-}
-
 Value sparse_tensor::createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc,
                                                Value tensor, Dimension dim) {
   auto enc = getSparseTensorEncoding(tensor.getType());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
index d0ef8a6860bb2..dc017e6baa6dc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
@@ -270,9 +270,6 @@ void storeAll(OpBuilder &builder, Location loc, Value mem, ValueRange vs,
 TypedValue<BaseMemRefType> genToMemref(OpBuilder &builder, Location loc,
                                        Value tensor);
 
-/// Generates code to retrieve the values size for the sparse tensor.
-Value genValMemSize(OpBuilder &builder, Location loc, Value tensor);
-
 /// Generates code to retrieve the slice offset for the sparse tensor slice,
 /// return a constant if the offset is statically known.
 Value createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc, Value tensor,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
index c2f631605bf4b..89858546e37e1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h
@@ -245,18 +245,18 @@ inline Value genTuple(OpBuilder &builder, Location loc,
   return genTuple(builder, loc, desc.getRankedTensorType(), desc.getFields());
 }
 
-inline SparseTensorDescriptor getDescriptorFromTensorTuple(Value tensor) {
+inline SparseTensorDescriptor
+getDescriptorFromTensorTuple(Value tensor, RankedTensorType type) {
   auto tuple = getTuple(tensor);
-  SparseTensorType stt(cast<RankedTensorType>(tuple.getResultTypes()[0]));
-  return SparseTensorDescriptor(stt, tuple.getInputs());
+  return SparseTensorDescriptor(SparseTensorType(type), tuple.getInputs());
 }
 
 inline MutSparseTensorDescriptor
-getMutDescriptorFromTensorTuple(Value tensor, SmallVectorImpl<Value> &fields) {
+getMutDescriptorFromTensorTuple(Value tensor, SmallVectorImpl<Value> &fields,
+                                RankedTensorType type) {
   auto tuple = getTuple(tensor);
   fields.assign(tuple.getInputs().begin(), tuple.getInputs().end());
-  SparseTensorType stt(cast<RankedTensorType>(tuple.getResultTypes()[0]));
-  return MutSparseTensorDescriptor(stt, fields);
+  return MutSparseTensorDescriptor(SparseTensorType(type), fields);
 }
 
 } // namespace sparse_tensor
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index dc8bab325184b..87c30a733c363 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -110,13 +110,16 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
           .Case<vector::CreateMaskOp>(
               [&](auto createMaskOp) -> std::optional<Operation *> {
                 OperandRange maskOperands = createMaskOp.getOperands();
-                size_t numMaskOperands = maskOperands.size();
+                // The `vector.create_mask` op creates a mask arrangement
+                // without any zeros at the front. Also, because
+                // `numFrontPadElems` is strictly smaller than
+                // `numSrcElemsPerDest`, the compressed mask generated by
+                // padding the original mask by `numFrontPadElems` will not
+                // have any zeros at the front either.
                 AffineExpr s0;
                 bindSymbols(rewriter.getContext(), s0);
-                s0 = s0 + numSrcElemsPerDest - 1;
-                s0 = s0.floorDiv(numSrcElemsPerDest);
-                OpFoldResult origIndex =
-                    getAsOpFoldResult(maskOperands[numMaskOperands - 1]);
+                s0 = (s0 + numFrontPadElems).ceilDiv(numSrcElemsPerDest);
+                OpFoldResult origIndex = getAsOpFoldResult(maskOperands.back());
                 OpFoldResult maskIndex = affine::makeComposedFoldedAffineApply(
                     rewriter, loc, s0, origIndex);
                 SmallVector<Value> newMaskOperands(maskOperands.drop_back());
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 7f6b2303f86e1..20cd9cba6909a 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -596,12 +596,11 @@ struct BubbleDownVectorBitCastForExtract
     unsigned expandRatio =
         castDstType.getNumElements() / castSrcType.getNumElements();
 
-    auto getFirstIntValue = [](ArrayRef<OpFoldResult> values) -> uint64_t {
-      assert(values[0].is<Attribute>() && "Unexpected non-constant index");
-      return cast<IntegerAttr>(values[0].get<Attribute>()).getInt();
-    };
-
-    uint64_t index = getFirstIntValue(extractOp.getMixedPosition());
+    // The first position must exist and be a constant; otherwise bail out.
+    auto mixedPos = extractOp.getMixedPosition();
+    if (mixedPos.empty() || !mixedPos[0].is<Attribute>())
+      return failure();
+    uint64_t index = cast<IntegerAttr>(mixedPos[0].get<Attribute>()).getInt();
 
     // Get the single scalar (as a vector) in the source value that packs the
     // desired scalar. E.g. extract vector<1xf32> from vector<4xf32>
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5bd3c370e3859..9d3c4366a7bd5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -93,6 +93,33 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
         {} /* empty const strides*/);
 }
 
+/// Builds a CreateNdDescOp on a memref \p source from mixed (static or
+/// dynamic) \p offsets, \p shape and \p strides of equal, non-zero length.
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<MemRefType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           llvm::ArrayRef<OpFoldResult> shape,
+                           llvm::ArrayRef<OpFoldResult> strides) {
+  assert(shape.size() && offsets.size() && strides.size() &&
+         shape.size() == strides.size() && shape.size() == offsets.size());
+
+  // Separate every mixed list into its dynamic values and static integers.
+  llvm::SmallVector<Value> dynOffsets, dynShape, dynStrides;
+  llvm::SmallVector<int64_t> cstOffsets, cstShape, cstStrides;
+  dispatchIndexOpFoldResults(offsets, dynOffsets, cstOffsets);
+  dispatchIndexOpFoldResults(shape, dynShape, cstShape);
+  dispatchIndexOpFoldResults(strides, dynStrides, cstStrides);
+
+  // Wrap the static integers as dense i64 array attributes.
+  auto cstOffsetsAttr = builder.getDenseI64ArrayAttr(cstOffsets);
+  auto cstShapeAttr = builder.getDenseI64ArrayAttr(cstShape);
+  auto cstStridesAttr = builder.getDenseI64ArrayAttr(cstStrides);
+
+  // Delegate to the overload taking dynamic operands and static attributes.
+  build(builder, state, tdesc, source, dynOffsets, dynShape, dynStrides,
+        cstOffsetsAttr, cstShapeAttr, cstStridesAttr);
+}
+
 void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
                            Type tdesc, TypedValue<IntegerType> source,
                            llvm::ArrayRef<OpFoldResult> offsets,
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index 25e9f80c9963c..e8e8f3cdfbfd7 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -91,11 +91,6 @@ IntegerType IntegerType::scaleElementBitwidth(unsigned scale) {
 //===----------------------------------------------------------------------===//
 
 unsigned FloatType::getWidth() {
-  // The actual width of TF32 is 19 bits. However, since it is a truncated
-  // version of Float32, we treat it as 32 bits in MLIR FloatType::getWidth
-  // for compatibility.
-  if (llvm::isa<FloatTF32Type>(*this))
-    return 32;
   return APFloat::semanticsSizeInBits(getFloatSemantics());
 }
 
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index 0b085b10b2b33..2c1276d577a55 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -558,6 +558,20 @@ mlir::intrange::inferOr(ArrayRef<ConstantIntRanges> argRanges) {
 
 ConstantIntRanges
 mlir::intrange::inferXor(ArrayRef<ConstantIntRanges> argRanges) {
+  // TODO: The code below doesn't work for bitwidths > i1.
+  // For input ranges lhs=[2060639849, 2060639850], rhs=[2060639849, 2060639849]
+  // widenBitwiseBounds will produce:
+  // lhs:
+  // 2060639848  01111010110100101101111001101000
+  // 2060639851  01111010110100101101111001101011
+  // rhs:
+  // 2060639849  01111010110100101101111001101001
+  // 2060639849  01111010110100101101111001101001
+  // None of those combinations xor to 0, while intermediate values do.
+  unsigned width = argRanges[0].umin().getBitWidth();
+  if (width > 1)
+    return ConstantIntRanges::maxRange(width);
+
   auto [lhsZeros, lhsOnes] = widenBitwiseBounds(argRanges[0]);
   auto [rhsZeros, rhsOnes] = widenBitwiseBounds(argRanges[1]);
   auto xori = [](const APInt &a, const APInt &b) -> std::optional<APInt> {
diff --git a/mlir/lib/Target/LLVM/ModuleToObject.cpp b/mlir/lib/Target/LLVM/ModuleToObject.cpp
index 77391341adaad..3f5b3d5e31864 100644
--- a/mlir/lib/Target/LLVM/ModuleToObject.cpp
+++ b/mlir/lib/Target/LLVM/ModuleToObject.cpp
@@ -34,10 +34,17 @@
 using namespace mlir;
 using namespace mlir::LLVM;
 
-ModuleToObject::ModuleToObject(Operation &module, StringRef triple,
-                               StringRef chip, StringRef features, int optLevel)
+ModuleToObject::ModuleToObject(
+    Operation &module, StringRef triple, StringRef chip, StringRef features,
+    int optLevel, function_ref<void(llvm::Module &)> initialLlvmIRCallback,
+    function_ref<void(llvm::Module &)> linkedLlvmIRCallback,
+    function_ref<void(llvm::Module &)> optimizedLlvmIRCallback,
+    function_ref<void(StringRef)> isaCallback)
     : module(module), triple(triple), chip(chip), features(features),
-      optLevel(optLevel) {}
+      optLevel(optLevel), initialLlvmIRCallback(initialLlvmIRCallback),
+      linkedLlvmIRCallback(linkedLlvmIRCallback),
+      optimizedLlvmIRCallback(optimizedLlvmIRCallback),
+      isaCallback(isaCallback) {}
 
 ModuleToObject::~ModuleToObject() = default;
 
@@ -215,6 +222,9 @@ std::optional<SmallVector<char, 0>> ModuleToObject::run() {
   }
   setDataLayoutAndTriple(*llvmModule);
 
+  if (initialLlvmIRCallback)
+    initialLlvmIRCallback(*llvmModule);
+
   // Link bitcode files.
   handleModulePreLink(*llvmModule);
   {
@@ -227,10 +237,16 @@ std::optional<SmallVector<char, 0>> ModuleToObject::run() {
     handleModulePostLink(*llvmModule);
   }
 
+  if (linkedLlvmIRCallback)
+    linkedLlvmIRCallback(*llvmModule);
+
   // Optimize the module.
   if (failed(optimizeModule(*llvmModule, optLevel)))
     return std::nullopt;
 
+  if (optimizedLlvmIRCallback)
+    optimizedLlvmIRCallback(*llvmModule);
+
   // Return the serialized object.
   return moduleToObject(*llvmModule);
 }
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 69602af8563aa..bca26e3a0e84a 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -86,7 +86,11 @@ SerializeGPUModuleBase::SerializeGPUModuleBase(
     Operation &module, NVVMTargetAttr target,
     const gpu::TargetOptions &targetOptions)
     : ModuleToObject(module, target.getTriple(), target.getChip(),
-                     target.getFeatures(), target.getO()),
+                     target.getFeatures(), target.getO(),
+                     targetOptions.getInitialLlvmIRCallback(),
+                     targetOptions.getLinkedLlvmIRCallback(),
+                     targetOptions.getOptimizedLlvmIRCallback(),
+                     targetOptions.getISACallback()),
       target(target), toolkitPath(targetOptions.getToolkitPath()),
       fileList(targetOptions.getLinkFiles()) {
 
@@ -572,6 +576,9 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
     getOperation().emitError() << "Failed translating the module to ISA.";
     return std::nullopt;
   }
+  if (isaCallback)
+    isaCallback(serializedISA.value());
+
 #define DEBUG_TYPE "serialize-to-isa"
   LLVM_DEBUG({
     llvm::dbgs() << "PTX for module: " << getOperation().getNameAttr() << "\n";
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 9cc66207660f6..cf58bc5d8f475 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -214,6 +214,20 @@ class NVVMDialectLLVMIRTranslationInterface
         generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName());
       if (values.size() > 2)
         generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName());
+    } else if (attribute.getName() ==
+               NVVM::NVVMDialect::getClusterDimAttrName()) {
+      auto values = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
+      if (!values || values.empty())
+        return failure();
+      generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName());
+      if (values.size() > 1)
+        generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName());
+      if (values.size() > 2)
+        generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName());
+    } else if (attribute.getName() ==
+               NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) {
+      auto value = dyn_cast<IntegerAttr>(attribute.getValue());
+      generateMetadata(value.getInt(), "cluster_max_blocks");
     } else if (attribute.getName() ==
                NVVM::NVVMDialect::getMinctasmAttrName()) {
       auto value = dyn_cast<IntegerAttr>(attribute.getValue());
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 5b2cfd370900a..42fe5b925654a 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -101,11 +101,6 @@ struct ConversionValueMapping {
     mapping.map(oldVal, newVal);
   }
 
-  /// Try to map a value to the one provided. Returns false if a transitive
-  /// mapping from the new value to the old value already exists, true if the
-  /// map was updated.
-  bool tryMap(Value oldVal, Value newVal);
-
   /// Drop the last mapping for the given value.
   void erase(Value value) { mapping.erase(value); }
 
@@ -149,14 +144,6 @@ Value ConversionValueMapping::lookupOrNull(Value from, Type desiredType) const {
   return result;
 }
 
-bool ConversionValueMapping::tryMap(Value oldVal, Value newVal) {
-  for (Value it = newVal; it; it = mapping.lookupOrNull(it))
-    if (it == oldVal)
-      return false;
-  map(oldVal, newVal);
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 // Rewriter and Translation State
 //===----------------------------------------------------------------------===//
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/dialects/quant.pyi b/mlir/python/mlir/_mlir_libs/_mlir/dialects/quant.pyi
index a10bc693ba600..47168d49c5568 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/dialects/quant.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/dialects/quant.pyi
@@ -101,7 +101,7 @@ class UniformQuantizedPerAxisType(QuantizedType):
   def scales(self) -> list[float]: ...
 
   @property
-  def zero_points(self) -> list[float]: ...
+  def zero_points(self) -> list[int]: ...
 
   @property
   def quantized_dimension(self) -> int: ...
diff --git a/mlir/python/mlir/extras/types.py b/mlir/python/mlir/extras/types.py
index 34eee1edb57ff..b875d639e9d40 100644
--- a/mlir/python/mlir/extras/types.py
+++ b/mlir/python/mlir/extras/types.py
@@ -21,6 +21,7 @@
     Float8E4M3Type,
     Float8E5M2Type,
     Float8E8M0FNUType,
+    FloatTF32Type,
     FunctionType,
     IndexType,
     IntegerType,
@@ -70,6 +71,7 @@ def ui(width):
 
 f16 = lambda: F16Type.get()
 f32 = lambda: F32Type.get()
+tf32 = lambda: FloatTF32Type.get()
 f64 = lambda: F64Type.get()
 bf16 = lambda: BF16Type.get()
 
diff --git a/mlir/test/CAPI/execution_engine.c b/mlir/test/CAPI/execution_engine.c
index 18120c6ec8028..4751288c3ee4b 100644
--- a/mlir/test/CAPI/execution_engine.c
+++ b/mlir/test/CAPI/execution_engine.c
@@ -55,7 +55,11 @@ void testSimpleExecution(void) {
       ctx, mlirStringRefCreateFromCString(
                // clang-format off
 "module {                                                                    \n"
+#ifdef __s390__
+"  func.func @add(%arg0 : i32) -> (i32 {llvm.signext}) attributes { llvm.emit_c_interface } {     \n"
+#else
 "  func.func @add(%arg0 : i32) -> i32 attributes { llvm.emit_c_interface } {     \n"
+#endif
 "    %res = arith.addi %arg0, %arg0 : i32                                        \n"
 "    return %res : i32                                                           \n"
 "  }                                                                             \n"
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 0d3e9f4ea2bf3..37a0c0067f7d3 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -659,7 +659,7 @@ gpu.module @test_module {
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_shuffle()
-  func.func @gpu_shuffle() -> (f32, f32) {
+  func.func @gpu_shuffle() -> (f32, f32, f32) {
     // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
     %arg0 = arith.constant 1.0 : f32
     // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
@@ -693,7 +693,21 @@ gpu.module @test_module {
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
     %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
-    func.return %shfl, %shfli : f32, f32
+    // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
+    // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32
+    // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
+    // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32
+    // CHECK: %[[#DOWN:]] = llvm.add %[[#LANE_ID]], %{{.*}} : i32
+    // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#DOWN]], %[[#WARP_OR_ZERO]] : i32
+    // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#DOWN]], %{{.*}} : i1, i32
+    // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32
+    // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
+    // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
+    // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
+    %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
+    func.return %shfl, %shfli, %shfld : f32, f32, f32
   }
 }
 
diff --git a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
index 13eb3a194df44..665d0a33abedc 100644
--- a/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
+++ b/mlir/test/Conversion/GPUToVulkan/lower-gpu-launch-vulkan-launch.mlir
@@ -11,7 +11,7 @@ module attributes {gpu.container_module} {
       %0 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
       %2 = spirv.Constant 0 : i32
       %3 = spirv.mlir.addressof @kernel_arg_0 : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>
-      %4 = spirv.AccessChain %0[%2, %2] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
+      %4 = spirv.AccessChain %0[%2, %2] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
       %5 = spirv.Load "StorageBuffer" %4 : f32
       spirv.Return
     }
diff --git a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
index 1847975b279af..803722294b316 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
@@ -11,7 +11,7 @@ spirv.func @access_chain() "None" {
   %1 = spirv.Variable : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>
   // CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.getelementptr %{{.*}}[%[[ZERO]], 1, %[[ONE]]] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.struct<packed (f32, array<4 x f32>)>
-  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32
+  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   spirv.Return
 }
 
@@ -20,7 +20,7 @@ spirv.func @access_chain_array(%arg0 : i32) "None" {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
   // CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.getelementptr %{{.*}}[%[[ZERO]], %{{.*}}] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.array<4 x array<4 x f32>>
-  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
+  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<!spirv.array<4xf32>, Function>
   %2 = spirv.Load "Function" %1 ["Volatile"] : !spirv.array<4xf32>
   spirv.Return
 }
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 03bcb341efea2..529dd4094507f 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2868,12 +2868,12 @@ func.func @flat_transpose_index(%arg0: vector<16xindex>) -> vector<16xindex> {
 
 // -----
 
-func.func @vector_load_op(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> {
+func.func @vector_load(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> {
   %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<8xf32>
   return %0 : vector<8xf32>
 }
 
-// CHECK-LABEL: func @vector_load_op
+// CHECK-LABEL: func @vector_load
 // CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
 // CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
 // CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
@@ -2882,12 +2882,26 @@ func.func @vector_load_op(%memref : memref<200x100xf32>, %i : index, %j : index)
 
 // -----
 
-func.func @vector_load_op_nontemporal(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> {
+func.func @vector_load_scalable(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<[8]xf32> {
+  %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
+// CHECK-LABEL: func @vector_load_scalable
+// CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
+// CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
+// CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
+// CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}}[%[[add]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+// CHECK: llvm.load %[[gep]] {alignment = 4 : i64} : !llvm.ptr -> vector<[8]xf32>
+
+// -----
+
+func.func @vector_load_nontemporal(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<8xf32> {
   %0 = vector.load %memref[%i, %j] {nontemporal = true} : memref<200x100xf32>, vector<8xf32>
   return %0 : vector<8xf32>
 }
 
-// CHECK-LABEL: func @vector_load_op_nontemporal
+// CHECK-LABEL: func @vector_load_nontemporal
 // CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
 // CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
 // CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
@@ -2896,24 +2910,65 @@ func.func @vector_load_op_nontemporal(%memref : memref<200x100xf32>, %i : index,
 
 // -----
 
-func.func @vector_load_op_index(%memref : memref<200x100xindex>, %i : index, %j : index) -> vector<8xindex> {
+func.func @vector_load_nontemporal_scalable(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<[8]xf32> {
+  %0 = vector.load %memref[%i, %j] {nontemporal = true} : memref<200x100xf32>, vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
+// CHECK-LABEL: func @vector_load_nontemporal_scalable
+// CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
+// CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
+// CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
+// CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}}[%[[add]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+// CHECK: llvm.load %[[gep]] {alignment = 4 : i64, nontemporal} : !llvm.ptr -> vector<[8]xf32>
+
+// -----
+
+func.func @vector_load_index(%memref : memref<200x100xindex>, %i : index, %j : index) -> vector<8xindex> {
   %0 = vector.load %memref[%i, %j] : memref<200x100xindex>, vector<8xindex>
   return %0 : vector<8xindex>
 }
-// CHECK-LABEL: func @vector_load_op_index
+// CHECK-LABEL: func @vector_load_index
 // CHECK: %[[T0:.*]] = llvm.load %{{.*}} {alignment = 8 : i64} : !llvm.ptr -> vector<8xi64>
 // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<8xi64> to vector<8xindex>
 // CHECK: return %[[T1]] : vector<8xindex>
 
 // -----
 
-func.func @vector_store_op(%memref : memref<200x100xf32>, %i : index, %j : index) {
+func.func @vector_load_index_scalable(%memref : memref<200x100xindex>, %i : index, %j : index) -> vector<[8]xindex> {
+  %0 = vector.load %memref[%i, %j] : memref<200x100xindex>, vector<[8]xindex>
+  return %0 : vector<[8]xindex>
+}
+// CHECK-LABEL: func @vector_load_index_scalable
+// CHECK: %[[T0:.*]] = llvm.load %{{.*}} {alignment = 8 : i64} : !llvm.ptr -> vector<[8]xi64>
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<[8]xi64> to vector<[8]xindex>
+// CHECK: return %[[T1]] : vector<[8]xindex>
+
+// -----
+
+func.func @vector_load_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<f32> {
+  %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+  return %0 : vector<f32>
+}
+
+// CHECK-LABEL: func @vector_load_0d
+// CHECK: %[[load:.*]] = memref.load %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: %[[vec:.*]] = llvm.mlir.undef : vector<1xf32>
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[inserted:.*]] = llvm.insertelement %[[load]], %[[vec]][%[[c0]] : i32] : vector<1xf32>
+// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[inserted]] : vector<1xf32> to vector<f32>
+// CHECK: return %[[cast]] : vector<f32>
+
+// -----
+
+
+func.func @vector_store(%memref : memref<200x100xf32>, %i : index, %j : index) {
   %val = arith.constant dense<11.0> : vector<4xf32>
   vector.store %val, %memref[%i, %j] : memref<200x100xf32>, vector<4xf32>
   return
 }
 
-// CHECK-LABEL: func @vector_store_op
+// CHECK-LABEL: func @vector_store
 // CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
 // CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
 // CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
@@ -2922,13 +2977,28 @@ func.func @vector_store_op(%memref : memref<200x100xf32>, %i : index, %j : index
 
 // -----
 
-func.func @vector_store_op_nontemporal(%memref : memref<200x100xf32>, %i : index, %j : index) {
+func.func @vector_store_scalable(%memref : memref<200x100xf32>, %i : index, %j : index) {
+  %val = arith.constant dense<11.0> : vector<[4]xf32>
+  vector.store %val, %memref[%i, %j] : memref<200x100xf32>, vector<[4]xf32>
+  return
+}
+
+// CHECK-LABEL: func @vector_store_scalable
+// CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
+// CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
+// CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
+// CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}}[%[[add]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+// CHECK: llvm.store %{{.*}}, %[[gep]] {alignment = 4 : i64} :  vector<[4]xf32>, !llvm.ptr
+
+// -----
+
+func.func @vector_store_nontemporal(%memref : memref<200x100xf32>, %i : index, %j : index) {
   %val = arith.constant dense<11.0> : vector<4xf32>
   vector.store %val, %memref[%i, %j] {nontemporal = true} : memref<200x100xf32>, vector<4xf32>
   return
 }
 
-// CHECK-LABEL: func @vector_store_op_nontemporal
+// CHECK-LABEL: func @vector_store_nontemporal
 // CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
 // CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
 // CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
@@ -2937,28 +3007,38 @@ func.func @vector_store_op_nontemporal(%memref : memref<200x100xf32>, %i : index
 
 // -----
 
-func.func @vector_store_op_index(%memref : memref<200x100xindex>, %i : index, %j : index) {
+func.func @vector_store_nontemporal_scalable(%memref : memref<200x100xf32>, %i : index, %j : index) {
+  %val = arith.constant dense<11.0> : vector<[4]xf32>
+  vector.store %val, %memref[%i, %j] {nontemporal = true} : memref<200x100xf32>, vector<[4]xf32>
+  return
+}
+
+// CHECK-LABEL: func @vector_store_nontemporal_scalable
+// CHECK: %[[c100:.*]] = llvm.mlir.constant(100 : index) : i64
+// CHECK: %[[mul:.*]] = llvm.mul %{{.*}}, %[[c100]]  : i64
+// CHECK: %[[add:.*]] = llvm.add %[[mul]], %{{.*}}  : i64
+// CHECK: %[[gep:.*]] = llvm.getelementptr %{{.*}}[%[[add]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+// CHECK: llvm.store %{{.*}}, %[[gep]] {alignment = 4 : i64, nontemporal} :  vector<[4]xf32>, !llvm.ptr
+
+// -----
+
+func.func @vector_store_index(%memref : memref<200x100xindex>, %i : index, %j : index) {
   %val = arith.constant dense<11> : vector<4xindex>
   vector.store %val, %memref[%i, %j] : memref<200x100xindex>, vector<4xindex>
   return
 }
-// CHECK-LABEL: func @vector_store_op_index
+// CHECK-LABEL: func @vector_store_index
 // CHECK: llvm.store %{{.*}}, %{{.*}} {alignment = 8 : i64} : vector<4xi64>, !llvm.ptr
 
 // -----
 
-func.func @vector_load_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<f32> {
-  %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
-  return %0 : vector<f32>
+func.func @vector_store_index_scalable(%memref : memref<200x100xindex>, %i : index, %j : index) {
+  %val = arith.constant dense<11> : vector<[4]xindex>
+  vector.store %val, %memref[%i, %j] : memref<200x100xindex>, vector<[4]xindex>
+  return
 }
-
-// CHECK-LABEL: func @vector_load_op_0d
-// CHECK: %[[load:.*]] = memref.load %{{.*}}[%{{.*}}, %{{.*}}]
-// CHECK: %[[vec:.*]] = llvm.mlir.undef : vector<1xf32>
-// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : i32
-// CHECK: %[[inserted:.*]] = llvm.insertelement %[[load]], %[[vec]][%[[c0]] : i32] : vector<1xf32>
-// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[inserted]] : vector<1xf32> to vector<f32>
-// CHECK: return %[[cast]] : vector<f32>
+// CHECK-LABEL: func @vector_store_index_scalable
+// CHECK: llvm.store %{{.*}}, %{{.*}} {alignment = 8 : i64} : vector<[4]xi64>, !llvm.ptr
 
 // -----
 
diff --git a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
index ded1687ca560b..650555cfb5fe1 100644
--- a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
+++ b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
@@ -35,10 +35,10 @@ func.func @dynamic_basis(%linear_index: index, %src: memref<?x?x?xf32>) -> (inde
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %b0 = memref.dim %src, %c0 : memref<?x?x?xf32>
   %b1 = memref.dim %src, %c1 : memref<?x?x?xf32>
   %b2 = memref.dim %src, %c2 : memref<?x?x?xf32>
-  %1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index
+  // Note: no outer bound.
+  %1:3 = affine.delinearize_index %linear_index into (%b1, %b2) : index, index, index
   return %1#0, %1#1, %1#2 : index, index, index
 }
 
@@ -60,10 +60,11 @@ func.func @linearize_static(%arg0: index, %arg1: index, %arg2: index) -> index {
 // CHECK-DAG: #[[$map0:.+]] =  affine_map<()[s0, s1, s2, s3, s4] -> (s1 * s2 + s3 + s0 * (s2 * s4))>
 
 // CHECK-LABEL: @linearize_dynamic
-// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index, %[[arg4:.+]]: index, %[[arg5:.+]]: index)
-// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg5]], %[[arg2]], %[[arg4]]]
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index, %[[arg4:.+]]: index)
+// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg4]], %[[arg2]], %[[arg3]]]
 // CHECK: return %[[val_0]]
-func.func @linearize_dynamic(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> index {
-  %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (%arg3, %arg4, %arg5) : index
+func.func @linearize_dynamic(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> index {
+  // Note: no outer bounds
+  %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (%arg3, %arg4) : index
   func.return %0 : index
 }
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
index ec00b31258d07..b54a13cffe777 100644
--- a/mlir/test/Dialect/Affine/canonicalize.mlir
+++ b/mlir/test/Dialect/Affine/canonicalize.mlir
@@ -1496,6 +1496,20 @@ func.func @delinearize_fold_negative_constant() -> (index, index, index) {
 
 // -----
 
+// CHECK-LABEL: @delinearize_fold_negative_constant_no_outer_bound
+// CHECK-DAG: %[[C_2:.+]] = arith.constant -2 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK-NOT: affine.delinearize_index
+// CHECK: return %[[C_2]], %[[C1]], %[[C3]]
+func.func @delinearize_fold_negative_constant_no_outer_bound() -> (index, index, index) {
+  %c_22 = arith.constant -22 : index
+  %0:3 = affine.delinearize_index %c_22 into (3, 5) : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+
+// -----
+
 // CHECK-LABEL: @delinearize_dont_fold_constant_dynamic_basis
 // CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index
 // CHECK: %[[RET:.+]]:3 = affine.delinearize_index %[[C22]]
@@ -1525,6 +1539,23 @@ func.func @drop_unit_basis_in_delinearize(%arg0 : index, %arg1 : index, %arg2 :
 
 // -----
 
+func.func @drop_unit_basis_in_delinearize_no_outer_bound(%arg0 : index, %arg1 : index, %arg2 : index) ->
+    (index, index, index, index, index, index) {
+  %c1 = arith.constant 1 : index
+  %0:6 = affine.delinearize_index %arg0 into (%arg1, 1, 1, %arg2, %c1)
+      : index, index, index, index, index, index
+  return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : index, index, index, index, index, index
+}
+// CHECK-LABEL: func @drop_unit_basis_in_delinearize_no_outer_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[DELINEARIZE:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], %[[ARG2]])
+//       CHECK:   return %[[DELINEARIZE]]#0, %[[DELINEARIZE]]#1, %[[C0]], %[[C0]], %[[DELINEARIZE]]#2, %[[C0]]
+
+// -----
+
 func.func @drop_all_unit_bases(%arg0 : index) -> (index, index) {
   %0:2 = affine.delinearize_index %arg0 into (1, 1) : index, index
   return %0#0, %0#1 : index, index
@@ -1537,6 +1568,18 @@ func.func @drop_all_unit_bases(%arg0 : index) -> (index, index) {
 
 // -----
 
+func.func @drop_all_unit_bases_no_outer_bound(%arg0 : index) -> (index, index, index) {
+  %0:3 = affine.delinearize_index %arg0 into (1, 1) : index, index, index
+  return %0#0, %0#1, %0#2 : index, index, index
+}
+// CHECK-LABEL: func @drop_all_unit_bases_no_outer_bound(
+//  CHECK-SAME:     %[[ARG0:.+]]: index)
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-NOT:   affine.delinearize_index
+//       CHECK:   return %[[ARG0]], %[[C0]], %[[C0]]
+
+// -----
+
 func.func @drop_single_loop_delinearize(%arg0 : index, %arg1 : index) -> index {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -1574,6 +1617,17 @@ func.func @delinearize_non_loop_like(%arg0: memref<?xi32>, %i : index) -> index
 
 // -----
 
+// CHECK-LABEL: func @delinearize_empty_basis
+// CHECK-SAME: (%[[ARG0:.+]]: index)
+// CHECK-NOT: affine.delinearize
+// CHECK: return %[[ARG0]]
+func.func @delinearize_empty_basis(%arg0: index) -> index {
+  %0 = affine.delinearize_index %arg0 into () : index
+  return %0 : index
+}
+
+// -----
+
 // CHECK-LABEL: @linearize_fold_constants
 // CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index
 // CHECK-NOT: affine.linearize
@@ -1588,6 +1642,42 @@ func.func @linearize_fold_constants() -> index {
 
 // -----
 
+// CHECK-LABEL: @linearize_fold_constants_no_outer_bound
+// CHECK-DAG: %[[C22:.+]] = arith.constant 22 : index
+// CHECK-NOT: affine.linearize
+// CHECK: return %[[C22]]
+func.func @linearize_fold_constants_no_outer_bound() -> index {
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+
+  %ret = affine.linearize_index [%c1, %c1, %c2] by (3, 5) : index
+  return %ret : index
+}
+
+// -----
+
+// CHECK-LABEL: @linearize_fold_empty_basis
+// CHECK-SAME: (%[[ARG0:.+]]: index)
+// CHECK-NOT: affine.linearize
+// CHECK: return %[[ARG0]]
+func.func @linearize_fold_empty_basis(%arg0: index) -> index {
+  %ret = affine.linearize_index [%arg0] by () : index
+  return %ret : index
+}
+
+// -----
+
+// CHECK-LABEL: @linearize_fold_only_outer_bound
+// CHECK-SAME: (%[[ARG0:.+]]: index)
+// CHECK-NOT: affine.linearize
+// CHECK: return %[[ARG0]]
+func.func @linearize_fold_only_outer_bound(%arg0: index) -> index {
+  %ret = affine.linearize_index [%arg0] by (2) : index
+  return %ret : index
+}
+
+// -----
+
 // CHECK-LABEL: @linearize_dont_fold_dynamic_basis
 // CHECK: %[[RET:.+]] = affine.linearize_index
 // CHECK: return %[[RET]]
@@ -1617,6 +1707,38 @@ func.func @cancel_delinearize_linearize_disjoint_exact(%arg0: index, %arg1: inde
 
 // -----
 
+// CHECK-LABEL: func @cancel_delinearize_linearize_disjoint_linearize_extra_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index)
+//       CHECK:     return %[[ARG0]], %[[ARG1]], %[[ARG2]]
+func.func @cancel_delinearize_linearize_disjoint_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) {
+  %0 = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (4, %arg4) : index
+  %1:3 = affine.delinearize_index %0 into (4, %arg4)
+      : index, index, index
+  return %1#0, %1#1, %1#2 : index, index, index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_delinearize_linearize_disjoint_delinearize_extra_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index)
+//       CHECK:     return %[[ARG0]], %[[ARG1]], %[[ARG2]]
+func.func @cancel_delinearize_linearize_disjoint_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> (index, index, index) {
+  %0 = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (4, %arg4) : index
+  %1:3 = affine.delinearize_index %0 into (%arg3, 4, %arg4)
+      : index, index, index
+  return %1#0, %1#1, %1#2 : index, index, index
+}
+
+// -----
+
 // Without `disjoint`, the cancelation isn't guaranteed to be the identity.
 // CHECK-LABEL: func @no_cancel_delinearize_linearize_exact(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
@@ -1666,6 +1788,17 @@ func.func @linearize_unit_basis_disjoint(%arg0: index, %arg1: index, %arg2: inde
 
 // -----
 
+// CHECK-LABEL: @linearize_unit_basis_disjoint_no_outer_bound
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index)
+// CHECK: %[[ret:.+]] = affine.linearize_index disjoint [%[[arg0]], %[[arg2]]] by (%[[arg3]]) : index
+// CHECK: return %[[ret]]
+func.func @linearize_unit_basis_disjoint_no_outer_bound(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> index {
+  %ret = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (1, %arg3) : index
+  return %ret : index
+}
+
+// -----
+
 // CHECK-LABEL: @linearize_unit_basis_zero
 // CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index)
 // CHECK: %[[ret:.+]] = affine.linearize_index [%[[arg0]], %[[arg1]]] by (3, %[[arg2]]) : index
@@ -1713,6 +1846,32 @@ func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: i
 
 // -----
 
+// CHECK-LABEL: func @cancel_linearize_denearize_linearize_extra_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
+//       CHECK:     return %[[ARG0]]
+func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (4, %arg2) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_denearize_delinearize_extra_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
+//       CHECK:     return %[[ARG0]]
+func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (4, %arg2) : index
+  return %1 : index
+}
+
+// -----
+
 // Don't cancel because the values from the delinearize aren't used in order
 // CHECK-LABEL: func @no_cancel_linearize_denearize_permuted(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
@@ -1756,3 +1915,16 @@ func.func @affine_leading_zero(%arg0: index, %arg1: index) -> index {
   return %ret : index
 }
 
+// -----
+
+// CHECK-LABEL: func @affine_leading_zero_no_outer_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[RET:.+]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (3, 5)
+//       CHECK:     return %[[RET]]
+func.func @affine_leading_zero_no_outer_bound(%arg0: index, %arg1: index) -> index {
+  %c0 = arith.constant 0 : index
+  %ret = affine.linearize_index [%c0, %arg0, %arg1] by (3, 5) : index
+  return %ret : index
+}
+
diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir
index f493f1b81ecc3..2694649576464 100644
--- a/mlir/test/Dialect/Affine/invalid.mlir
+++ b/mlir/test/Dialect/Affine/invalid.mlir
@@ -533,37 +533,29 @@ func.func @missing_for_min(%arg0: index, %arg1: index, %arg2: memref<100xf32>) {
 // -----
 
 func.func @delinearize(%idx: index, %basis0: index, %basis1 :index) {
-  // expected-error@+1 {{'affine.delinearize_index' op should return an index for each basis element}}
+  // expected-error@+1 {{'affine.delinearize_index' op should return an index for each basis element and up to one extra index}}
   %1 = affine.delinearize_index %idx into (%basis0, %basis1) : index
   return
 }
 
 // -----
 
-func.func @delinearize(%idx: index, %basis0: index, %basis1 :index) {
-  // expected-error@+1 {{'affine.delinearize_index' op basis should not be empty}}
-  affine.delinearize_index %idx into () : index
+func.func @delinearize(%idx: index) {
+  // expected-error@+1 {{'affine.delinearize_index' op no basis element may be statically non-positive}}
+  %1:2 = affine.delinearize_index %idx into (2, -2) : index, index
   return
 }
 
 // -----
 
 func.func @linearize(%idx: index, %basis0: index, %basis1 :index) -> index {
-  // expected-error@+1 {{'affine.linearize_index' op should be passed an index for each basis element}}
+  // expected-error@+1 {{'affine.linearize_index' op should be passed a basis element for each index except possibly the first}}
   %0 = affine.linearize_index [%idx] by (%basis0, %basis1) : index
   return %0 : index
 }
 
 // -----
 
-func.func @linearize_empty() -> index {
-  // expected-error@+1 {{'affine.linearize_index' op basis should not be empty}}
-  %0 = affine.linearize_index [] by () : index
-  return %0 : index
-}
-
-// -----
-
 func.func @dynamic_dimension_index() {
   "unknown.region"() ({
     %idx = "unknown.test"() : () -> (index)
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index afb0b4929bce7..4db846fa4656a 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -454,9 +454,35 @@ func.func @ori(%arg0 : i128, %arg1 : i128) -> i1 {
     func.return %2 : i1
 }
 
+// CHECK-LABEL: func @xori_issue_82168
+// arith.cmpi was erroneously folded to %false, see Issue #82168.
+// CHECK: %[[R:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}} : i64
+// CHECK: return %[[R]]
+func.func @xori_issue_82168() -> i1 {
+    %c0_i64 = arith.constant 0 : i64
+    %c2060639849_i64 = arith.constant 2060639849 : i64
+    %2 = test.with_bounds { umin = 2060639849 : i64, umax = 2060639850 : i64, smin = 2060639849 : i64, smax = 2060639850 : i64 } : i64
+    %3 = arith.xori %2, %c2060639849_i64 : i64
+    %4 = arith.cmpi eq, %3, %c0_i64 : i64
+    func.return %4 : i1
+}
+
+// CHECK-LABEL: func @xori_i1
+//   CHECK-DAG: %[[true:.*]] = arith.constant true
+//   CHECK-DAG: %[[false:.*]] = arith.constant false
+//       CHECK: return %[[true]], %[[false]]
+func.func @xori_i1() -> (i1, i1) {
+    %true = arith.constant true
+    %1 = test.with_bounds { umin = 0 : i1, umax = 0 : i1, smin = 0 : i1, smax = 0 : i1 } : i1
+    %2 = test.with_bounds { umin = 1 : i1, umax = 1 : i1, smin = 1 : i1, smax = 1 : i1 } : i1
+    %3 = arith.xori %1, %true : i1
+    %4 = arith.xori %2, %true : i1
+    func.return %3, %4 : i1, i1
+}
+
 // CHECK-LABEL: func @xori
-// CHECK: %[[false:.*]] = arith.constant false
-// CHECK: return %[[false]]
+// TODO: xor folding is temporarily disabled
+// CHECK-NOT: arith.constant false
 func.func @xori(%arg0 : i64, %arg1 : i64) -> i1 {
     %c0 = arith.constant 0 : i64
     %c7 = arith.constant 7 : i64
diff --git a/mlir/test/Dialect/Func/func-bufferize.mlir b/mlir/test/Dialect/Func/func-bufferize.mlir
deleted file mode 100644
index 22986bbc76010..0000000000000
--- a/mlir/test/Dialect/Func/func-bufferize.mlir
+++ /dev/null
@@ -1,83 +0,0 @@
-// RUN: mlir-opt %s -func-bufferize -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL:   func @identity(
-// CHECK-SAME:                   %[[ARG:.*]]: memref<f32>) -> memref<f32> {
-// CHECK:           return %[[ARG]] : memref<f32>
-func.func @identity(%arg0: tensor<f32>) -> tensor<f32> {
-  return %arg0 : tensor<f32>
-}
-
-// CHECK-LABEL:   func @block_arguments(
-// CHECK-SAME:        %[[ARG:.*]]: memref<f32>) -> memref<f32> {
-// CHECK:           cf.br ^bb1(%[[ARG]] : memref<f32>)
-// CHECK:         ^bb1(%[[BBARG:.*]]: memref<f32>):
-// CHECK:           return %[[BBARG]] : memref<f32>
-func.func @block_arguments(%arg0: tensor<f32>) -> tensor<f32> {
-  cf.br ^bb1(%arg0: tensor<f32>)
-^bb1(%bbarg: tensor<f32>):
-  return %bbarg : tensor<f32>
-}
-
-// CHECK-LABEL:   func private @source() -> memref<f32>
-// CHECK-LABEL:   func @call_source() -> memref<f32> {
-// CHECK:           %[[RET:.*]] = call @source() : () -> memref<f32>
-// CHECK:           return %[[RET]] : memref<f32>
-func.func private @source() -> tensor<f32>
-func.func @call_source() -> tensor<f32> {
-  %0 = call @source() : () -> tensor<f32>
-  return %0 : tensor<f32>
-}
-// CHECK-LABEL:   func @call_sink(
-// CHECK-SAME:                    %[[ARG:.*]]: memref<f32>) {
-// CHECK:           call @sink(%[[ARG]]) : (memref<f32>) -> ()
-// CHECK:           return
-func.func private @sink(tensor<f32>)
-func.func @call_sink(%arg0: tensor<f32>) {
-  call @sink(%arg0) : (tensor<f32>) -> ()
-  return
-}
-
-// CHECK-LABEL:   func @unconverted_op_in_body() -> memref<f32> {
-// CHECK:           %[[TENSOR:.*]] = "test.source"() : () -> tensor<f32>
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<f32>
-// CHECK:           return %[[MEMREF]] : memref<f32>
-func.func @unconverted_op_in_body() -> tensor<f32> {
-  %0 = "test.source"() : () -> tensor<f32>
-  return %0 : tensor<f32>
-}
-
-// -----
-
-// Because this pass updates block arguments, it needs to also atomically
-// update all terminators and issue an error if that is not possible.
-func.func @unable_to_update_terminator(%arg0: tensor<f32>) -> tensor<f32> {
-    %0 = arith.constant true
-    cf.cond_br %0, ^bb1(%arg0: tensor<f32>), ^bb2(%arg0: tensor<f32>)
-  ^bb1(%bbarg0: tensor<f32>):
-    // expected-error @+1 {{failed to legalize operation 'test.terminator'}}
-    "test.terminator"() : () -> ()
-  ^bb2(%bbarg1: tensor<f32>):
-    return %bbarg1 : tensor<f32>
-}
-
-// -----
-
-// There was a bug in func-bufferize pass which caused terminators without
-// ReturnLike and BranchOpInterface traits (e.g. scf.condition) to always
-// fail to legalize even if bufferization doesn't needed.
-// Check the pass succedeed.
-// CHECK: bufferize_while
-// CHECK: scf.while
-// CHECK: scf.condition
-func.func @bufferize_while(%arg0: i64, %arg1: i64) -> i64 {
-  %c2_i64 = arith.constant 2 : i64
-  %0:2 = scf.while (%arg2 = %arg0) : (i64) -> (i64, i64) {
-    %1 = arith.cmpi slt, %arg2, %arg1 : i64
-    scf.condition(%1) %arg2, %arg2 : i64, i64
-  } do {
-  ^bb0(%arg2: i64, %arg3: i64):
-    %1 = arith.muli %arg3, %c2_i64 : i64
-    scf.yield %1 : i64
-  }
-  return %0#1 : i64
-}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 4afa839aa3ea1..92789246edb4f 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -388,6 +388,17 @@ llvm.func @rocdl.s.wait.dscnt() {
 
 // -----
 
+llvm.func @rocdl.readlane(%src : f32) -> f32 {
+  %cst0 = llvm.mlir.constant(0 : i32) : i32
+
+  // CHECK-LABEL: rocdl.readlane
+  // CHECK: rocdl.readlane %{{.*}} %{{.*}}
+  %ret = rocdl.readlane %src, %cst0 : (f32, i32) -> f32
+  llvm.return %ret : f32
+}
+
+// -----
+
 // expected-error@below {{attribute attached to unexpected op}}
 func.func private @expected_llvm_func() attributes { rocdl.kernel }
 
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir
similarity index 94%
rename from mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
rename to mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir
index 1fae311467bcf..ec761d9a49436 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-generalize-tensor-pack"  %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -transform-interpreter --canonicalize \
+// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-pack.mlir' \
+// RUN: -transform-interpreter=entry-point=decompose_pack \
+// RUN: -transform-interpreter  %s | FileCheck %s
 
 func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8x32xf32>) -> tensor<1x1x4x8x8x32xf32> {
   %0 = tensor.pack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x128x64xf32> -> tensor<1x1x4x8x8x32xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir
similarity index 98%
rename from mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir
rename to mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir
index ad20541e301d3..4f986606ef93a 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt  --transform-preload-library='transform-library-paths=%p/td/generalize-pack.mlir' -split-input-file  --transform-interpreter %s | FileCheck %s
+// RUN: mlir-opt -split-input-file \
+// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-pack.mlir' \
+// RUN: -transform-interpreter=entry-point=decompose_pack %s | FileCheck %s
 
 func.func @simple_KCRS_to_KCRSsr(%arg0: tensor<?x?xi32>, %arg1: tensor<1x1x?x1xi32>) -> tensor<1x1x?x1xi32> {
   %c8 = arith.constant 8 : index
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
similarity index 98%
rename from mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
rename to mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
index c15859d898ec1..6d9709caf7093 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-generalize-tensor-unpack"  %s | FileCheck %s
+// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-decompose-tensor-unpack"  %s | FileCheck %s
 
 func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> {
   %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
similarity index 99%
rename from mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir
rename to mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
index 153ce68b8f086..8b15873473a97 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file --test-linalg-transform-patterns="test-generalize-tensor-unpack"  %s | FileCheck %s
+// RUN: mlir-opt -split-input-file --test-linalg-transform-patterns="test-decompose-tensor-unpack"  %s | FileCheck %s
 
 func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> {
   %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32>
diff --git a/mlir/test/Dialect/Linalg/td/generalize-pack.mlir b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir
similarity index 70%
rename from mlir/test/Dialect/Linalg/td/generalize-pack.mlir
rename to mlir/test/Dialect/Linalg/td/decompose-pack.mlir
index 62e5b779ff361..49c45e29d5a14 100644
--- a/mlir/test/Dialect/Linalg/td/generalize-pack.mlir
+++ b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir
@@ -1,10 +1,10 @@
 module @transforms attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+  transform.named_sequence @decompose_pack(%module: !transform.any_op {transform.readonly}) {
     %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op
 
     %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %1 {
-      transform.apply_patterns.linalg.generalize_pack_unpack
+      transform.apply_patterns.linalg.decompose_pack_unpack
     } : !transform.any_op
 
     transform.yield
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index aa41eea44f3ef..2a19e4837f550 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2620,6 +2620,44 @@ func.func @omp_loop_invalid_binding(%lb : index, %ub : index, %step : index) {
       omp.yield
     }
   }
+  return
+}
+
+// -----
+func.func @nested_wrapper(%idx : index) {
+  omp.workshare {
+    // expected-error @below {{cannot be composite}}
+    omp.workshare.loop_wrapper {
+      omp.simd {
+        omp.loop_nest (%iv) : index = (%idx) to (%idx) step (%idx) {
+          omp.yield
+        }
+      } {omp.composite}
+    }
+    omp.terminator
+  }
+  return
+}
+
+// -----
+func.func @not_wrapper() {
+  omp.workshare {
+    // expected-error @below {{op nested in loop wrapper is not another loop wrapper or `omp.loop_nest`}}
+    omp.workshare.loop_wrapper {
+      %0 = arith.constant 0 : index
+    }
+    omp.terminator
+  }
+  return
+}
 
+// -----
+func.func @missing_workshare(%idx : index) {
+  // expected-error @below {{must be nested in an omp.workshare}}
+  omp.workshare.loop_wrapper {
+    omp.loop_nest (%iv) : index = (%idx) to (%idx) step (%idx) {
+      omp.yield
+    }
+  }
   return
 }
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 69c53d1f77e84..c25a6ef4b4849 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -2789,3 +2789,70 @@ func.func @omp_loop(%lb : index, %ub : index, %step : index) {
 
   return
 }
+
+// CHECK-LABEL: func @omp_workshare
+func.func @omp_workshare() {
+  // CHECK: omp.workshare {
+  omp.workshare {
+    "test.payload"() : () -> ()
+    // CHECK: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @omp_workshare_nowait
+func.func @omp_workshare_nowait() {
+  // CHECK: omp.workshare nowait {
+  omp.workshare nowait {
+    "test.payload"() : () -> ()
+    // CHECK: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @omp_workshare_multiple_blocks
+func.func @omp_workshare_multiple_blocks() {
+  // CHECK: omp.workshare {
+  omp.workshare {
+    cf.br ^bb2
+    ^bb2:
+    // CHECK: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @omp_workshare_loop_wrapper
+func.func @omp_workshare_loop_wrapper(%idx : index) {
+  // CHECK-NEXT: omp.workshare {
+  omp.workshare {
+    // CHECK-NEXT: omp.workshare.loop_wrapper
+    omp.workshare.loop_wrapper {
+      // CHECK-NEXT: omp.loop_nest
+      omp.loop_nest (%iv) : index = (%idx) to (%idx) step (%idx) {
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @omp_workshare_loop_wrapper_attrs
+func.func @omp_workshare_loop_wrapper_attrs(%idx : index) {
+  // CHECK-NEXT: omp.workshare {
+  omp.workshare {
+    // CHECK-NEXT: omp.workshare.loop_wrapper {
+    omp.workshare.loop_wrapper {
+      // CHECK-NEXT: omp.loop_nest
+      omp.loop_nest (%iv) : index = (%idx) to (%idx) step (%idx) {
+        omp.yield
+      }
+    // CHECK: } {attr_in_dict}
+    } {attr_in_dict}
+    omp.terminator
+  }
+  return
+}
diff --git a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
index fcc5299e39d77..12bfee9fb6511 100644
--- a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
@@ -8,21 +8,21 @@ func.func @access_chain_struct() -> () {
   %0 = spirv.Constant 1: i32
   %1 = spirv.Variable : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>
   // CHECK: spirv.AccessChain {{.*}}[{{.*}}, {{.*}}] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4 x f32>)>, Function>
-  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32
+  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
 func.func @access_chain_1D_array(%arg0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4xf32>, Function>
   // CHECK: spirv.AccessChain {{.*}}[{{.*}}] : !spirv.ptr<!spirv.array<4 x f32>, Function>
-  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4xf32>, Function>, i32
+  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4xf32>, Function>, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
 func.func @access_chain_2D_array_1(%arg0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
   // CHECK: spirv.AccessChain {{.*}}[{{.*}}, {{.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32>>, Function>
-  %1 = spirv.AccessChain %0[%arg0, %arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32
+  %1 = spirv.AccessChain %0[%arg0, %arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   %2 = spirv.Load "Function" %1 ["Volatile"] : f32
   return
 }
@@ -30,7 +30,7 @@ func.func @access_chain_2D_array_1(%arg0 : i32) -> () {
 func.func @access_chain_2D_array_2(%arg0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
   // CHECK: spirv.AccessChain {{.*}}[{{.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32>>, Function>
-  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
+  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<!spirv.array<4xf32>, Function>
   %2 = spirv.Load "Function" %1 ["Volatile"] : !spirv.array<4xf32>
   return
 }
@@ -38,7 +38,7 @@ func.func @access_chain_2D_array_2(%arg0 : i32) -> () {
 func.func @access_chain_rtarray(%arg0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.rtarray<f32>, Function>
   // CHECK: spirv.AccessChain {{.*}}[{{.*}}] : !spirv.ptr<!spirv.rtarray<f32>, Function>
-  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.rtarray<f32>, Function>, i32
+  %1 = spirv.AccessChain %0[%arg0] : !spirv.ptr<!spirv.rtarray<f32>, Function>, i32 -> !spirv.ptr<f32, Function>
   %2 = spirv.Load "Function" %1 ["Volatile"] : f32
   return
 }
@@ -49,7 +49,7 @@ func.func @access_chain_non_composite() -> () {
   %0 = spirv.Constant 1: i32
   %1 = spirv.Variable : !spirv.ptr<f32, Function>
   // expected-error @+1 {{cannot extract from non-composite type 'f32' with index 0}}
-  %2 = spirv.AccessChain %1[%0] : !spirv.ptr<f32, Function>, i32
+  %2 = spirv.AccessChain %1[%0] : !spirv.ptr<f32, Function>, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -57,8 +57,8 @@ func.func @access_chain_non_composite() -> () {
 
 func.func @access_chain_no_indices(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
-  // expected-error @+1 {{expected at least one index}}
-  %1 = spirv.AccessChain %0[] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
+  // expected-error @+1 {{custom op 'spirv.AccessChain' 0 operands present, but expected 1}}
+  %1 = spirv.AccessChain %0[] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -75,8 +75,8 @@ func.func @access_chain_missing_comma(%index0 : i32) -> () {
 
 func.func @access_chain_invalid_indices_types_count(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
-  // expected-error @+1 {{'spirv.AccessChain' op indices types' count must be equal to indices info count}}
-  %1 = spirv.AccessChain %0[%index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32
+  // expected-error @+1 {{custom op 'spirv.AccessChain' 1 operands present, but expected 2}}
+  %1 = spirv.AccessChain %0[%index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32 -> !spirv.ptr<!spirv.array<4xf32>, Function>
   return
 }
 
@@ -84,8 +84,8 @@ func.func @access_chain_invalid_indices_types_count(%index0 : i32) -> () {
 
 func.func @access_chain_missing_indices_type(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
-  // expected-error @+1 {{'spirv.AccessChain' op indices types' count must be equal to indices info count}}
-  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
+  // expected-error @+1 {{custom op 'spirv.AccessChain' 2 operands present, but expected 1}}
+  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -94,8 +94,8 @@ func.func @access_chain_missing_indices_type(%index0 : i32) -> () {
 func.func @access_chain_invalid_type(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
   %1 = spirv.Load "Function" %0 ["Volatile"] : !spirv.array<4x!spirv.array<4xf32>>
-  // expected-error @+1 {{expected a pointer to composite type, but provided '!spirv.array<4 x !spirv.array<4 x f32>>'}}
-  %2 = spirv.AccessChain %1[%index0] : !spirv.array<4x!spirv.array<4xf32>>, i32
+  // expected-error @+1 {{'spirv.AccessChain' op operand #0 must be any SPIR-V pointer type, but got '!spirv.array<4 x !spirv.array<4 x f32>>'}}
+  %2 = spirv.AccessChain %1[%index0] : !spirv.array<4x!spirv.array<4xf32>>, i32 -> f32
   return
 }
 
@@ -113,7 +113,7 @@ func.func @access_chain_invalid_index_1(%index0 : i32) -> () {
 func.func @access_chain_invalid_index_2(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>
   // expected-error @+1 {{index must be an integer spirv.Constant to access element of spirv.struct}}
-  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32
+  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -123,7 +123,7 @@ func.func @access_chain_invalid_constant_type_1() -> () {
   %0 = arith.constant 1: i32
   %1 = spirv.Variable : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>
   // expected-error @+1 {{index must be an integer spirv.Constant to access element of spirv.struct, but provided arith.constant}}
-  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32
+  %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -133,7 +133,7 @@ func.func @access_chain_out_of_bounds() -> () {
   %index0 = "spirv.Constant"() { value = 12: i32} : () -> i32
   %0 = spirv.Variable : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>
   // expected-error @+1 {{'spirv.AccessChain' op index 12 out of bounds for '!spirv.struct<(f32, !spirv.array<4 x f32>)>'}}
-  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32
+  %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   return
 }
 
@@ -142,9 +142,9 @@ func.func @access_chain_out_of_bounds() -> () {
 func.func @access_chain_invalid_accessing_type(%index0 : i32) -> () {
   %0 = spirv.Variable : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
   // expected-error @+1 {{cannot extract from non-composite type 'f32' with index 0}}
-  %1 = spirv.AccessChain %0[%index, %index0, %index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32, i32
+  %1 = spirv.AccessChain %0[%index0, %index0, %index0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32, i32 -> !spirv.ptr<f32, Function>
   return
-
+}
 // -----
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
index 1eed5892a0857..5e98b9fdb3c54 100644
--- a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
@@ -11,7 +11,7 @@ spirv.module Logical GLSL450 {
     // CHECK: [[VAR1:%.*]] = spirv.mlir.addressof @var1 : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4 x f32>)>, Input>
     // CHECK-NEXT: spirv.AccessChain [[VAR1]][{{.*}}, {{.*}}] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4 x f32>)>, Input>
     %1 = spirv.mlir.addressof @var1 : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Input>
-    %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Input>, i32, i32
+    %2 = spirv.AccessChain %1[%0, %0] : !spirv.ptr<!spirv.struct<(f32, !spirv.array<4xf32>)>, Input>, i32, i32 -> !spirv.ptr<f32, Input>
     spirv.Return
   }
 }
diff --git a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir
index 6a5edc7f1781b..4fdb6799c97fa 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir
@@ -103,14 +103,14 @@ spirv.module Logical GLSL450 {
     %37 = spirv.IAdd %arg4, %11 : i32
     // CHECK: spirv.AccessChain [[ARG0]]
     %c0 = spirv.Constant 0 : i32
-    %38 = spirv.AccessChain %arg0[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32
+    %38 = spirv.AccessChain %arg0[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %39 = spirv.Load "StorageBuffer" %38 : f32
     // CHECK: spirv.AccessChain [[ARG1]]
-    %40 = spirv.AccessChain %arg1[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32
+    %40 = spirv.AccessChain %arg1[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %41 = spirv.Load "StorageBuffer" %40 : f32
     %42 = spirv.FAdd %39, %41 : f32
     // CHECK: spirv.AccessChain [[ARG2]]
-    %43 = spirv.AccessChain %arg2[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32
+    %43 = spirv.AccessChain %arg2[%c0, %36, %37] : !spirv.ptr<!spirv.struct<(!spirv.array<12 x !spirv.array<4 x f32>>)>, StorageBuffer>, i32, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Store "StorageBuffer" %43, %42 : f32
     spirv.Return
   }
diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
index d07389d6822ce..3a775e209903c 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -11,8 +11,8 @@ func.func @combine_full_access_chain() -> f32 {
   // CHECK-NEXT: spirv.Load "Function" %[[PTR]]
   %c0 = spirv.Constant 0: i32
   %0 = spirv.Variable : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>
-  %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32
-  %2 = spirv.AccessChain %1[%c0, %c0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32
+  %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32 -> !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
+  %2 = spirv.AccessChain %1[%c0, %c0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
   %3 = spirv.Load "Function" %2 : f32
   spirv.ReturnValue %3 : f32
 }
@@ -28,9 +28,9 @@ func.func @combine_access_chain_multi_use() -> !spirv.array<4xf32> {
   // CHECK-NEXT: spirv.Load "Function" %[[PTR_1]]
   %c0 = spirv.Constant 0: i32
   %0 = spirv.Variable : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>
-  %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32
-  %2 = spirv.AccessChain %1[%c0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
-  %3 = spirv.AccessChain %2[%c0] : !spirv.ptr<!spirv.array<4xf32>, Function>, i32
+  %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32 -> !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>
+  %2 = spirv.AccessChain %1[%c0] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<!spirv.array<4xf32>, Function>
+  %3 = spirv.AccessChain %2[%c0] : !spirv.ptr<!spirv.array<4xf32>, Function>, i32 -> !spirv.ptr<f32, Function>
   %4 = spirv.Load "Function" %2 : !spirv.array<4xf32>
   %5 = spirv.Load "Function" %3 : f32
   spirv.ReturnValue %4: !spirv.array<4xf32>
@@ -49,8 +49,8 @@ func.func @dont_combine_access_chain_without_common_base() -> !spirv.array<4xi32
   %c1 = spirv.Constant 1: i32
   %0 = spirv.Variable : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>
   %1 = spirv.Variable : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>
-  %2 = spirv.AccessChain %0[%c1] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32
-  %3 = spirv.AccessChain %1[%c1] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32
+  %2 = spirv.AccessChain %0[%c1] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32 -> !spirv.ptr<!spirv.array<4xi32>, Function>
+  %3 = spirv.AccessChain %1[%c1] : !spirv.ptr<!spirv.struct<(!spirv.array<4x!spirv.array<4xf32>>, !spirv.array<4xi32>)>, Function>, i32 -> !spirv.ptr<!spirv.array<4xi32>, Function>
   %4 = spirv.Load "Function" %2 : !spirv.array<4xi32>
   %5 = spirv.Load "Function" %3 : !spirv.array<4xi32>
   spirv.ReturnValue %4 : !spirv.array<4xi32>
diff --git a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir
index 3aadb19ec1582..bd3c665013136 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir
@@ -37,7 +37,7 @@ spirv.module Logical GLSL450 {
   spirv.func @callee() "None" {
     %0 = spirv.mlir.addressof @data : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32> [0])>, StorageBuffer>
     %1 = spirv.Constant 0: i32
-    %2 = spirv.AccessChain %0[%1, %1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32> [0])>, StorageBuffer>, i32, i32
+    %2 = spirv.AccessChain %0[%1, %1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i32, StorageBuffer>
     spirv.Branch ^next
 
   ^next:
@@ -196,7 +196,7 @@ spirv.module Logical GLSL450 {
     // CHECK: [[VAL:%.*]] = spirv.Load "StorageBuffer" [[LOADPTR]]
     %2 = spirv.mlir.addressof @arg_0 : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>
     %3 = spirv.mlir.addressof @arg_1 : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>
-    %4 = spirv.AccessChain %2[%1] : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>, i32
+    %4 = spirv.AccessChain %2[%1] : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>, i32 -> !spirv.ptr<i32, StorageBuffer>
     %5 = spirv.Load "StorageBuffer" %4 : i32
     %6 = spirv.SGreaterThan %5, %1 : i32
     // CHECK: spirv.mlir.selection
@@ -204,7 +204,7 @@ spirv.module Logical GLSL450 {
       spirv.BranchConditional %6, ^bb1, ^bb2
     ^bb1: // pred: ^bb0
       // CHECK: [[STOREPTR:%.*]] = spirv.AccessChain [[ADDRESS_ARG1]]
-      %7 = spirv.AccessChain %3[%1] : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>, i32
+      %7 = spirv.AccessChain %3[%1] : !spirv.ptr<!spirv.struct<(i32 [0])>, StorageBuffer>, i32 -> !spirv.ptr<i32, StorageBuffer>
       // CHECK-NOT: spirv.FunctionCall
       // CHECK: spirv.AtomicIAdd <Device> <AcquireRelease> [[STOREPTR]], [[VAL]]
       // CHECK: spirv.Branch
diff --git a/mlir/test/Dialect/SPIRV/Transforms/layout-decoration.mlir b/mlir/test/Dialect/SPIRV/Transforms/layout-decoration.mlir
index d2c9f832346c1..656bd43c6ed9f 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/layout-decoration.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/layout-decoration.mlir
@@ -24,7 +24,7 @@ spirv.module Logical GLSL450 {
     // CHECK: {{%.*}} = spirv.mlir.addressof @var0 : !spirv.ptr<!spirv.struct<(i32 [0], !spirv.struct<(f32 [0], i32 [4])> [4], f32 [12])>, Uniform>
     %0 = spirv.mlir.addressof @var0 : !spirv.ptr<!spirv.struct<(i32, !spirv.struct<(f32, i32)>, f32)>, Uniform>
     // CHECK:  {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}] : !spirv.ptr<!spirv.struct<(i32 [0], !spirv.struct<(f32 [0], i32 [4])> [4], f32 [12])>, Uniform>
-    %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(i32, !spirv.struct<(f32, i32)>, f32)>, Uniform>, i32
+    %1 = spirv.AccessChain %0[%c0] : !spirv.ptr<!spirv.struct<(i32, !spirv.struct<(f32, i32)>, f32)>, Uniform>, i32 -> !spirv.ptr<i32, Uniform>
     spirv.Return
   }
 }
diff --git a/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir b/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir
index ac9589ba24323..f5cd490c164d7 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir
@@ -7,7 +7,7 @@ spirv.module Logical GLSL450 {
   spirv.func @load_store_scalar(%index: i32) -> f32 "None" {
     %c0 = spirv.Constant 0 : i32
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %value = spirv.Load "StorageBuffer" %ac : f32
     spirv.Store "StorageBuffer" %ac, %value : f32
     spirv.ReturnValue %value : f32
@@ -39,7 +39,7 @@ spirv.module Logical GLSL450 {
   spirv.func @load_store_scalar_64bit(%index: i64) -> f32 "None" {
     %c0 = spirv.Constant 0 : i64
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i64, i64
+    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i64, i64 -> !spirv.ptr<f32, StorageBuffer>
     %value = spirv.Load "StorageBuffer" %ac : f32
     spirv.Store "StorageBuffer" %ac, %value : f32
     spirv.ReturnValue %value : f32
@@ -66,9 +66,9 @@ spirv.module Logical GLSL450 {
   spirv.func @multiple_uses(%i0: i32, %i1: i32) -> f32 "None" {
     %c0 = spirv.Constant 0 : i32
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %val0 = spirv.Load "StorageBuffer" %ac0 : f32
-    %ac1 = spirv.AccessChain %addr[%c0, %i1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr[%c0, %i1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %val1 = spirv.Load "StorageBuffer" %ac1 : f32
     %value = spirv.FAdd %val0, %val1 : f32
     spirv.ReturnValue %value : f32
@@ -95,7 +95,7 @@ spirv.module Logical GLSL450 {
   spirv.func @vector3(%index: i32) -> f32 "None" {
     %c0 = spirv.Constant 0 : i32
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %value = spirv.Load "StorageBuffer" %ac : f32
     spirv.ReturnValue %value : f32
   }
@@ -116,7 +116,7 @@ spirv.module Logical GLSL450 {
   spirv.func @not_aliased(%index: i32) -> f32 "None" {
     %c0 = spirv.Constant 0 : i32
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %value = spirv.Load "StorageBuffer" %ac : f32
     spirv.Store "StorageBuffer" %ac, %value : f32
     spirv.ReturnValue %value : f32
@@ -141,15 +141,15 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %val0 = spirv.Load "StorageBuffer" %ac0 : f32
 
     %addr1 = spirv.mlir.addressof @var01s_1 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %val1 = spirv.Load "StorageBuffer" %ac1 : f32
 
     %addr2 = spirv.mlir.addressof @var01v_1 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
-    %ac2 = spirv.AccessChain %addr2[%c0, %index, %c0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32, i32
+    %ac2 = spirv.AccessChain %addr2[%c0, %index, %c0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %val2 = spirv.Load "StorageBuffer" %ac2 : f32
 
     %add0 = spirv.FAdd %val0, %val1 : f32
@@ -182,11 +182,11 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01s_i32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i32, StorageBuffer>
     %val0 = spirv.Load "StorageBuffer" %ac0 : i32
 
     %addr1 = spirv.mlir.addressof @var01s_f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Store "StorageBuffer" %ac1, %val1 : f32
 
     spirv.ReturnValue %val0 : i32
@@ -219,7 +219,7 @@ spirv.module Logical GLSL450 {
   spirv.func @different_primitive_type(%index: i32, %val0: i32) -> i32 "None" {
     %c0 = spirv.Constant 0 : i32
     %addr = spirv.mlir.addressof @var01s : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i32, StorageBuffer>
     %val1 = spirv.Load "StorageBuffer" %ac : i32
     spirv.Store "StorageBuffer" %ac, %val0 : i32
     spirv.ReturnValue %val1 : i32
@@ -251,7 +251,7 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01s_i64 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i64, StorageBuffer>
     %val0 = spirv.Load "StorageBuffer" %ac0 : i64
 
     spirv.ReturnValue %val0 : i64
@@ -292,13 +292,13 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01s_f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %f32val = spirv.Load "StorageBuffer" %ac0 : f32
     %f64val = spirv.FConvert %f32val : f32 to f64
     %i64val = spirv.Bitcast %f64val : f64 to i64
 
     %addr1 = spirv.mlir.addressof @var01s_i64 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %i1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %i1] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i64, StorageBuffer>
     // expected-error@+1 {{failed to legalize operation 'spirv.Store'}}
     spirv.Store "StorageBuffer" %ac1, %i64val : i64
 
@@ -317,11 +317,11 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01_vec4 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<vector<4xf32>, StorageBuffer>
     %vec4val = spirv.Load "StorageBuffer" %ac0 : vector<4xf32>
 
     %addr1 = spirv.mlir.addressof @var01_scalar : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %scalarval = spirv.Load "StorageBuffer" %ac1 : f32
 
     %val = spirv.CompositeInsert %scalarval, %vec4val[0 : i32] : f32 into vector<4xf32>
@@ -368,15 +368,15 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr0 = spirv.mlir.addressof @var01_v4f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<vector<4xf32>, StorageBuffer>
     %vec4val = spirv.Load "StorageBuffer" %ac0 : vector<4xf32>
 
     %addr1 = spirv.mlir.addressof @var01_f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %f32val = spirv.Load "StorageBuffer" %ac1 : f32
 
     %addr2 = spirv.mlir.addressof @var01_i64 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>
-    %ac2 = spirv.AccessChain %addr2[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>, i32, i32
+    %ac2 = spirv.AccessChain %addr2[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i64, StorageBuffer>
     %i64val = spirv.Load "StorageBuffer" %ac2 : i64
     %i32val = spirv.SConvert %i64val : i64 to i32
     %castval = spirv.Bitcast %i32val : i32 to f32
@@ -433,7 +433,7 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr = spirv.mlir.addressof @var01_i64 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i64, stride=8> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i64, StorageBuffer>
     %val = spirv.Load "StorageBuffer" %ac : i64
 
     spirv.ReturnValue %val : i64
@@ -462,7 +462,7 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr = spirv.mlir.addressof @var01_i16 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i16, stride=2> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i16, stride=2> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<i16, stride=2> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i16, StorageBuffer>
     %val = spirv.Load "StorageBuffer" %ac : i16
 
     spirv.ReturnValue %val : i16
@@ -486,7 +486,7 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
 
     %addr = spirv.mlir.addressof @var00_v4f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
-    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
+    %ac = spirv.AccessChain %addr[%c0, %i0] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<vector<4xf32>, StorageBuffer>
     %val = spirv.Load "StorageBuffer" %ac :  vector<4xf32>
 
     spirv.ReturnValue %val : vector<4xf32>
@@ -516,11 +516,11 @@ spirv.module Logical GLSL450 {
     %c0 = spirv.Constant 0 : i32
     %v0 = spirv.Constant dense<0.0> : vector<3xf32>
     %addr0 = spirv.mlir.addressof @var01_v2f16 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf16>, stride=4> [0])>, StorageBuffer>
-    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf16>, stride=4> [0])>, StorageBuffer>, i32, i32
+    %ac0 = spirv.AccessChain %addr0[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf16>, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<vector<2xf16>, StorageBuffer>
     %value0 = spirv.Load "StorageBuffer" %ac0 : vector<2xf16>
 
     %addr1 = spirv.mlir.addressof @var01_v2f32 : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf32>, stride=8> [0])>, StorageBuffer>
-    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf32>, stride=8> [0])>, StorageBuffer>, i32, i32
+    %ac1 = spirv.AccessChain %addr1[%c0, %index] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<2xf32>, stride=8> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<vector<2xf32>, StorageBuffer>
     %value1 = spirv.Load "StorageBuffer" %ac1 : vector<2xf32>
 
     %val0_as_f32 = spirv.Bitcast %value0 : vector<2xf16> to f32
@@ -554,7 +554,7 @@ spirv.module Logical GLSL450 {
 spirv.module Logical GLSL450 {
   spirv.func @main(%arg0: !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>) "None" {
     %cst0_i32 = spirv.Constant 0 : i32
-    %0 = spirv.AccessChain %arg0[%cst0_i32, %cst0_i32] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %0 = spirv.AccessChain %arg0[%cst0_i32, %cst0_i32] : !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Return
   }
 }
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 460ac3947f5c8..3a40b462b8270 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -111,9 +111,9 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
 
-    // func-bufferize can be applied only to ModuleOps.
+    // duplicate-function-elimination can be applied only to ModuleOps.
     // expected-error @below {{pass pipeline failed}}
-    transform.apply_registered_pass "func-bufferize" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "duplicate-function-elimination" to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Vector/eliminate-masks.mlir b/mlir/test/Dialect/Vector/eliminate-masks.mlir
index 0b89b0604faab..0b78687fb9832 100644
--- a/mlir/test/Dialect/Vector/eliminate-masks.mlir
+++ b/mlir/test/Dialect/Vector/eliminate-masks.mlir
@@ -5,16 +5,22 @@
 // CHECK-LABEL: @eliminate_redundant_masks_through_insert_and_extracts
 // CHECK: %[[ALL_TRUE_MASK:.*]] = vector.constant_mask [4] : vector<[4]xi1>
 // CHECK: vector.transfer_read {{.*}} %[[ALL_TRUE_MASK]]
+// CHECK: vector.mask %[[ALL_TRUE_MASK]] {
+// CHECK-SAME:  vector.outerproduct
 // CHECK: vector.transfer_write {{.*}} %[[ALL_TRUE_MASK]]
-func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor<1x1000xf32>) {
-  %c0 = arith.constant 0 : index
+#map = affine_map<()[s0] -> (-(1080 mod s0) + 1080)>
+
+func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor<1x1000xf32>, %rhs : f32) {
   %c4 = arith.constant 4 : index
-  %c1000 = arith.constant 1000 : index
-  %c0_f32 = arith.constant 0.0 : f32
   %vscale = vector.vscale
   %c4_vscale = arith.muli %vscale, %c4 : index
+  %ub = affine.apply #map()[%c4_vscale]
+
+  %c0 = arith.constant 0 : index
+  %c1000 = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
   %extracted_slice_0 = tensor.extract_slice %tensor[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x1000xf32> to tensor<1x?xf32>
-  %output_tensor = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
+  %output_tensor = scf.for %i = %c0 to %ub step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
     // 1. Extract a slice.
     %extracted_slice_1 = tensor.extract_slice %arg[0, %i] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
 
@@ -23,8 +29,8 @@ func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor
     %mask = vector.create_mask %dim_1 : vector<[4]xi1>
 
     // 3. Read the slice and do some computation.
-    %vec = vector.transfer_read %extracted_slice_1[%c0], %c0_f32, %mask {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32>
-    %new_vec = "test.some_computation"(%vec) : (vector<[4]xf32>) -> (vector<[4]xf32>)
+    %lhs = vector.transfer_read %extracted_slice_1[%c0], %c0_f32, %mask {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32>
+    %new_vec = vector.mask %mask { vector.outerproduct %lhs, %rhs {kind = #vector.kind<add>} : vector<[4]xf32>, f32 } : vector<[4]xi1> -> vector<[4]xf32>
 
     // 4. Write the new value.
     %write = vector.transfer_write %new_vec, %extracted_slice_1[%c0], %mask {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32>
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
index b1a0d4f924f3c..721c8a8d5d203 100644
--- a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
+++ b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
@@ -42,7 +42,7 @@ func.func @vector_transfer_read_i2() -> vector<3xi2> {
 
 // -----
 
-func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
+func.func @vector_constant_mask_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
   %0 = memref.alloc() : memref<3x5xi2>
   %cst = arith.constant dense<0> : vector<3x5xi2>
   %mask = vector.constant_mask [3] : vector<5xi1>
@@ -54,7 +54,7 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
   return %2 : vector<3x5xi2>
 }
 
-// CHECK-LABEL: func @vector_cst_maskedload_i2(
+// CHECK-LABEL: func @vector_constant_mask_maskedload_i2(
 // CHECK-SAME: %[[ARG0:.+]]: vector<5xi2>) -> vector<3x5xi2>
 // CHECK: %[[ORIGINMASK:.+]] = vector.constant_mask [3] : vector<5xi1>
 // CHECK: %[[NEWMASK:.+]] = arith.constant dense<true> : vector<2xi1>
@@ -74,6 +74,55 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
 
 // -----
 
+// Tests generating the compressed mask for `vector.create_mask` with a statically shaped memref and a dynamic mask size.
+// Specifically, the program performs a masked load of a vector<5xi2> from `memref<3x5xi2>[1, 0]`, with an unknown mask size `m`.
+// After the emulation transformation, it performs a masked load of 2 bytes from linearized index `memref<4xi8>[1]`, with a new
+// compressed mask size given by `ceildiv(m + 1, 4)`.
+func.func @unaligned_create_mask_dynamic_i2(%m : index, %passthru: vector<5xi2>) -> vector<5xi2> {
+    %0 = memref.alloc() : memref<3x5xi2>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %mask = vector.create_mask %m : vector<5xi1>
+    %1 = vector.maskedload %0[%c1, %c0], %mask, %passthru :
+      memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
+    return %1 : vector<5xi2>
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> ((s0 + 1) ceildiv 4)>
+// CHECK: func @unaligned_create_mask_dynamic_i2(
+// CHECK-SAME:  %[[NUM_ELEMS_TO_LOAD:.+]]: index, %[[PASSTHRU:.+]]: vector<5xi2>)
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<4xi8>
+// CHECK: %[[COMPRESSED_MASK:.+]] = affine.apply #[[MAP]]()[%[[NUM_ELEMS_TO_LOAD]]]
+// CHECK: vector.create_mask %[[COMPRESSED_MASK]] : vector<2xi1>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: vector.maskedload %[[ALLOC]][%[[C1]]]
+
+// -----
+
+// Tests generating the compressed mask for `vector.create_mask` with a statically shaped memref and a constant mask size.
+// Same setup as the previous test, except that the mask size is a compile-time constant.
+// In this case, the desired slice `vector<7xi2>` spans 3 bytes.
+func.func @check_unaligned_create_mask_static_i2(%passthru: vector<7xi2>) -> vector<7xi2> {
+    %0 = memref.alloc() : memref<3x7xi2>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c3 = arith.constant 3 : index
+    %mask = vector.create_mask %c3 : vector<7xi1>
+    %1 = vector.maskedload %0[%c1, %c0], %mask, %passthru :
+      memref<3x7xi2>, vector<7xi1>, vector<7xi2> into vector<7xi2>
+    return %1 : vector<7xi2>
+}
+
+// CHECK: func @check_unaligned_create_mask_static_i2(
+// CHECK-SAME:     %[[PASSTHRU:[a-zA-Z0-9]+]]: vector<7xi2>)
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xi8>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[COMP_MASK:.+]] = vector.create_mask %[[C2]] : vector<3xi1>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: vector.maskedload %[[ALLOC]][%[[C1]]], %[[COMP_MASK]]
+
+// -----
+
 func.func @vector_load_i2_dynamic_indexing(%idx1: index, %idx2: index) -> vector<3xi2> {
   %0 = memref.alloc() : memref<3x3xi2>
   %cst = arith.constant dense<0> : vector<3x3xi2>
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir
index 7a3ba95893383..9dc3eb6989c6c 100644
--- a/mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir
@@ -141,7 +141,7 @@ func.func @vector_maskedload_i8(%arg1: index, %arg2: index, %arg3: index, %passt
 // CHECK-NEXT:   return
 
 //  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 4)>
-//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
+//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 //      CHECK32: func @vector_maskedload_i8(
 // CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
 // CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<4xi8>)
@@ -169,7 +169,7 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
     return %2 : vector<3x8xi4>
 }
 //  CHECK-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
-//  CHECK-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 1) floordiv 2)>
+//  CHECK-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
 //      CHECK: func @vector_maskedload_i4(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
@@ -185,7 +185,7 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
 //      CHECK:   %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>
 
 //  CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
-//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
+//  CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
 //      CHECK32: func @vector_maskedload_i4(
 // CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
 // CHECK32-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
@@ -497,7 +497,7 @@ func.func @vector_maskedstore_i8(%arg0: index, %arg1: index, %arg2: index, %valu
 // CHECK-NEXT:   return
 
 // CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 2 + s1 floordiv 4)>
-// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
+// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 // CHECK32:     func @vector_maskedstore_i8(
 // CHECK32-SAME:     %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK32-SAME:     %[[ARG1:[a-zA-Z0-9]+]]
@@ -530,7 +530,7 @@ func.func @vector_maskedstore_i4(
     return
 }
 // CHECK: #[[$ATTR_10:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
-// CHECK: #[[$ATTR_11:.+]] = affine_map<()[s0] -> ((s0 + 1) floordiv 2)>
+// CHECK: #[[$ATTR_11:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
 
 // CHECK-LABEL:   func.func @vector_maskedstore_i4(
 // CHECK-SAME:      %[[IDX_1:[a-zA-Z0-9]+]]: index,
@@ -550,7 +550,7 @@ func.func @vector_maskedstore_i4(
 // CHECK:           vector.maskedstore %[[ALLOC]]{{\[}}%[[LIDX]]], %[[NEW_MASK]], %[[NEW_VAL]] : memref<12xi8>, vector<4xi1>, vector<4xi8>
 
 // CHECK32: #[[$ATTR_17:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
-// CHECK32: #[[$ATTR_18:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
+// CHECK32: #[[$ATTR_18:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
 
 // CHECK32-LABEL:   func.func @vector_maskedstore_i4(
 // CHECK32-SAME:      %[[IDX_1:[a-zA-Z0-9]+]]: index,
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index 89e8ca1d93109..de12a87253a67 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -433,3 +433,16 @@ func.func @vec_0D(%arg0: vector<f32>) -> vector<i32> {
   %0 = vector.bitcast %arg0 : vector<f32> to vector<i32>
   return %0 : vector<i32>
 }
+
+// Make sure we do not crash on a dynamic-index `vector.extract`:
+func.func @vector_extract_dynamic_index(%arg0 : vector<4xi32>, %index : index) -> i16 {
+  %0 = vector.bitcast %arg0 : vector<4xi32> to vector<8xi16>
+  %1 = vector.extract %0[%index] : i16 from vector<8xi16>
+  return %1 : i16
+}
+
+// CHECK-LABEL: func.func @vector_extract_dynamic_index
+// CHECK-SAME: (%[[VEC:.+]]: vector<4xi32>, %[[IDX:.+]]: index) -> i16 {
+// CHECK: %[[BC:.+]] = vector.bitcast %[[VEC]] : vector<4xi32> to vector<8xi16>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BC]][%[[IDX]]] : i16 from vector<8xi16>
+// CHECK: return %[[EXTRACT]]
diff --git a/mlir/test/IR/attribute.mlir b/mlir/test/IR/attribute.mlir
index a62de3f5004d7..0085d64ae82b6 100644
--- a/mlir/test/IR/attribute.mlir
+++ b/mlir/test/IR/attribute.mlir
@@ -561,6 +561,14 @@ func.func @correct_type_pass() {
 
 // -----
 
+func.func @tf32_elements_attr() {
+  // CHECK: "foo"() {attr = dense<4.000000e+00> : tensor<tf32>} : () -> ()
+  "foo"() {attr = dense<4.0> : tensor<tf32>} : () -> ()
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // Test StringElementsAttr
 //===----------------------------------------------------------------------===//
@@ -675,6 +683,14 @@ func.func @dense_array_attr() attributes {
 
 // -----
 
+func.func @test_invalid_bitwidth_type() {
+  // expected-error @below{{element type bitwidth must be a multiple of 8}}
+  "foo"() {tf32attr = array<tf32: 1024.0>} : () -> ()
+  return
+}
+
+// -----
+
 func.func @testConfinedDenseArrayAttr() {
   "test.confined_dense_array_attr"() {
     i64attr = array<i64: 0, 2, 3>,
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir
index bec1b9a4e9d82..0428ada86041d 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir
@@ -1,6 +1,6 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule |\
-// DEFINE:  mlir-opt --test-linalg-transform-patterns="test-generalize-tensor-pack"\
+// DEFINE:  mlir-opt --test-linalg-transform-patterns="test-decompose-tensor-pack"\
 // DEFINE:    --test-transform-dialect-erase-schedule \
 // DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" \
 // DEFINE:    -buffer-deallocation-pipeline="private-function-dynamic-ownership" \
diff --git a/mlir/test/Integration/Dialect/MemRef/verify-memref.mlir b/mlir/test/Integration/Dialect/MemRef/verify-memref.mlir
index 431ae0a89d20c..f1ef69ffb75ef 100644
--- a/mlir/test/Integration/Dialect/MemRef/verify-memref.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/verify-memref.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: -func-bufferize -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \
+// RUN:   -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \
 // RUN:   -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm\
 // RUN:   -convert-func-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner \
@@ -8,17 +8,17 @@
 // RUN: FileCheck %s
 
 module {
-  func.func private @verifyMemRefI8(%a : tensor<*xi8>, %b : tensor<*xi8>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefI16(%a : tensor<*xi16>, %b : tensor<*xi16>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefI32(%a : tensor<*xi32>, %b : tensor<*xi32>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefI64(%a : tensor<*xi64>, %b : tensor<*xi64>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefBF16(%a : tensor<*xbf16>, %b : tensor<*xbf16>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefF16(%a : tensor<*xf16>, %b : tensor<*xf16>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefF32(%a : tensor<*xf32>, %b : tensor<*xf32>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefF64(%a : tensor<*xf64>, %b : tensor<*xf64>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefC32(%a : tensor<*xcomplex<f32>>, %b : tensor<*xcomplex<f32>>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefC64(%a : tensor<*xcomplex<f64>>, %b : tensor<*xcomplex<f64>>) -> i64 attributes { llvm.emit_c_interface }
-  func.func private @verifyMemRefInd(%a : tensor<*xindex>, %b : tensor<*xindex>) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefI8(%a : tensor<*xi8> {bufferization.access = "read"}, %b : tensor<*xi8> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefI16(%a : tensor<*xi16> {bufferization.access = "read"}, %b : tensor<*xi16> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefI32(%a : tensor<*xi32> {bufferization.access = "read"}, %b : tensor<*xi32> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefI64(%a : tensor<*xi64> {bufferization.access = "read"}, %b : tensor<*xi64> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefBF16(%a : tensor<*xbf16> {bufferization.access = "read"}, %b : tensor<*xbf16> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefF16(%a : tensor<*xf16> {bufferization.access = "read"}, %b : tensor<*xf16> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefF32(%a : tensor<*xf32> {bufferization.access = "read"}, %b : tensor<*xf32> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefF64(%a : tensor<*xf64> {bufferization.access = "read"}, %b : tensor<*xf64> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefC32(%a : tensor<*xcomplex<f32>> {bufferization.access = "read"}, %b : tensor<*xcomplex<f32>> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefC64(%a : tensor<*xcomplex<f64>> {bufferization.access = "read"}, %b : tensor<*xcomplex<f64>> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
+  func.func private @verifyMemRefInd(%a : tensor<*xindex> {bufferization.access = "read"}, %b : tensor<*xindex> {bufferization.access = "read"}) -> i64 attributes { llvm.emit_c_interface }
 
   func.func @entry() {
     %i8 = arith.constant dense<90> : tensor<3x3xi8>
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index e5ea03ff7e001..a4a3581d6b759 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -586,6 +586,28 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 2
 // CHECK:     {ptr @kernel_func, !"reqntidz", i32 32}
 // -----
 
+llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} {
+  llvm.return
+}
+
+// CHECK:     !nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+// CHECK:     {ptr @kernel_func, !"cluster_dim_x", i32 3}
+// CHECK:     {ptr @kernel_func, !"cluster_dim_y", i32 5}
+// CHECK:     {ptr @kernel_func, !"cluster_dim_z", i32 7}
+// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
+// -----
+
+llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
+  llvm.return
+}
+
+// CHECK:     !nvvm.annotations =
+// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
+// CHECK:     {ptr @kernel_func, !"cluster_max_blocks", i32 8}
+// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
+// -----
+
 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = 16} {
   llvm.return
 }
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 2f34070147be4..0620c23b5fdad 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -118,6 +118,25 @@ llvm.func @rocdl.ballot64(%pred : i1) -> i64 {
   llvm.return %0 : i64
 }
 
+llvm.func @rocdl.readlane(%src0 : f32, %src1: f64, %src2: i32, %src3: vector<2 x f32>) -> f32 {
+  %idx = llvm.mlir.constant(0 : i32) : i32
+
+  // CHECK-LABEL: rocdl.readlane
+  // CHECK: call float @llvm.amdgcn.readlane.f32(float %{{.*}}, i32 0)
+  %0 = rocdl.readlane %src0, %idx : (f32, i32) -> f32
+
+  // CHECK: call double @llvm.amdgcn.readlane.f64(double %{{.*}}, i32 0)
+  %1 = rocdl.readlane %src1, %idx : (f64, i32) -> f64
+
+  // CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %{{.*}}, i32 0)
+  %2 = rocdl.readlane %src2, %idx : (i32, i32) -> i32
+
+  // CHECK: call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %{{.*}}, i32 0)
+  %3 = rocdl.readlane %src3, %idx : (vector<2 x f32>, i32) -> vector<2 x f32>
+
+  llvm.return %0 : f32
+}
+
 llvm.func @rocdl.waitcnt() {
   // CHECK-LABEL: rocdl.waitcnt
   // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
diff --git a/mlir/test/Target/SPIRV/array-two-step-roundtrip.mlir b/mlir/test/Target/SPIRV/array-two-step-roundtrip.mlir
index 427b926527240..1a3bc88633d0d 100644
--- a/mlir/test/Target/SPIRV/array-two-step-roundtrip.mlir
+++ b/mlir/test/Target/SPIRV/array-two-step-roundtrip.mlir
@@ -3,7 +3,7 @@
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   spirv.func @array_stride(%arg0 : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, %arg1 : i32, %arg2 : i32) "None" {
     // CHECK: {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32, stride=4>, stride=128>, StorageBuffer>, i32, i32
-    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, i32, i32
+    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Return
   }
 }
diff --git a/mlir/test/Target/SPIRV/array.mlir b/mlir/test/Target/SPIRV/array.mlir
index c01b295b1abe4..56908e687a914 100644
--- a/mlir/test/Target/SPIRV/array.mlir
+++ b/mlir/test/Target/SPIRV/array.mlir
@@ -3,7 +3,7 @@
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   spirv.func @array_stride(%arg0 : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, %arg1 : i32, %arg2 : i32) "None" {
     // CHECK: {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32, stride=4>, stride=128>, StorageBuffer>, i32, i32
-    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, i32, i32
+    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32, stride=4>, stride=128>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Return
   }
 }
diff --git a/mlir/test/Target/SPIRV/debug.mlir b/mlir/test/Target/SPIRV/debug.mlir
index d1cd71d65ca8d..58bf364593fc9 100644
--- a/mlir/test/Target/SPIRV/debug.mlir
+++ b/mlir/test/Target/SPIRV/debug.mlir
@@ -58,7 +58,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
 
   spirv.func @memory_accesses(%arg0 : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, StorageBuffer>, %arg1 : i32, %arg2 : i32) "None" {
     // CHECK: loc({{".*debug.mlir"}}:61:10)
-    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, StorageBuffer>, i32, i32
+    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     // CHECK: loc({{".*debug.mlir"}}:63:10)
     %3 = spirv.Load "StorageBuffer" %2 : f32
     // CHECK: loc({{.*debug.mlir"}}:65:5)
diff --git a/mlir/test/Target/SPIRV/global-variable.mlir b/mlir/test/Target/SPIRV/global-variable.mlir
index 28b2706d3d163..a70ed316c68d3 100644
--- a/mlir/test/Target/SPIRV/global-variable.mlir
+++ b/mlir/test/Target/SPIRV/global-variable.mlir
@@ -54,7 +54,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.mlir.addressof @globalInvocationID : !spirv.ptr<vector<3xi32>, Input>
     %1 = spirv.Constant 0: i32
     // CHECK: spirv.AccessChain %[[ADDR]]
-    %2 = spirv.AccessChain %0[%1] : !spirv.ptr<vector<3xi32>, Input>, i32
+    %2 = spirv.AccessChain %0[%1] : !spirv.ptr<vector<3xi32>, Input>, i32 -> !spirv.ptr<i32, Input>
     spirv.Return
   }
 }
diff --git a/mlir/test/Target/SPIRV/loop.mlir b/mlir/test/Target/SPIRV/loop.mlir
index 08039ccc822df..d89600558f56d 100644
--- a/mlir/test/Target/SPIRV/loop.mlir
+++ b/mlir/test/Target/SPIRV/loop.mlir
@@ -69,9 +69,9 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   spirv.func @loop_kernel() "None" {
     %0 = spirv.mlir.addressof @GV1 : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>
     %1 = spirv.Constant 0 : i32
-    %2 = spirv.AccessChain %0[%1] : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>, i32
+    %2 = spirv.AccessChain %0[%1] : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>, i32 -> !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>
     %3 = spirv.mlir.addressof @GV2 : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>
-    %5 = spirv.AccessChain %3[%1] : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>, i32
+    %5 = spirv.AccessChain %3[%1] : !spirv.ptr<!spirv.struct<(!spirv.array<10 x f32, stride=4> [0])>, StorageBuffer>, i32 -> !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>
     %6 = spirv.Constant 4 : i32
     %7 = spirv.Constant 42 : i32
     %8 = spirv.Constant 2 : i32
@@ -88,9 +88,9 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
       spirv.BranchConditional %10, ^body, ^merge
 // CHECK-NEXT:   ^bb2:     // pred: ^bb1
     ^body:
-      %11 = spirv.AccessChain %2[%9] : !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>, i32
+      %11 = spirv.AccessChain %2[%9] : !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>, i32 -> !spirv.ptr<f32, StorageBuffer>
       %12 = spirv.Load "StorageBuffer" %11 : f32
-      %13 = spirv.AccessChain %5[%9] : !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>, i32
+      %13 = spirv.AccessChain %5[%9] : !spirv.ptr<!spirv.array<10 x f32, stride=4>, StorageBuffer>, i32 -> !spirv.ptr<f32, StorageBuffer>
       spirv.Store "StorageBuffer" %13, %12 : f32
 // CHECK:          %[[ADD:.*]] = spirv.IAdd
       %14 = spirv.IAdd %9, %8 : i32
diff --git a/mlir/test/Target/SPIRV/matrix.mlir b/mlir/test/Target/SPIRV/matrix.mlir
index b52c3f4aa2f11..2a391df4bff39 100644
--- a/mlir/test/Target/SPIRV/matrix.mlir
+++ b/mlir/test/Target/SPIRV/matrix.mlir
@@ -4,7 +4,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   // CHECK-LABEL: @matrix_access_chain
   spirv.func @matrix_access_chain(%arg0 : !spirv.ptr<!spirv.matrix<3 x vector<3xf32>>, Function>, %arg1 : i32) -> !spirv.ptr<vector<3xf32>, Function> "None" {
     // CHECK: {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}] : !spirv.ptr<!spirv.matrix<3 x vector<3xf32>>, Function>
-    %0 = spirv.AccessChain %arg0[%arg1] : !spirv.ptr<!spirv.matrix<3 x vector<3xf32>>,Function>, i32
+    %0 = spirv.AccessChain %arg0[%arg1] : !spirv.ptr<!spirv.matrix<3 x vector<3xf32>>, Function>, i32 -> !spirv.ptr<vector<3xf32>, Function>
     spirv.ReturnValue %0 : !spirv.ptr<vector<3xf32>, Function>
   }
 
diff --git a/mlir/test/Target/SPIRV/memory-ops.mlir b/mlir/test/Target/SPIRV/memory-ops.mlir
index f7abdabeac3eb..6b50c3921d427 100644
--- a/mlir/test/Target/SPIRV/memory-ops.mlir
+++ b/mlir/test/Target/SPIRV/memory-ops.mlir
@@ -28,8 +28,8 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   spirv.func @access_chain(%arg0 : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, %arg1 : i32, %arg2 : i32) "None" {
     // CHECK: {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32>>, Function>
     // CHECK-NEXT: {{%.*}} = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.array<4 x !spirv.array<4 x f32>>, Function>
-    %1 = spirv.AccessChain %arg0[%arg1] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32
-    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32
+    %1 = spirv.AccessChain %arg0[%arg1] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32 -> !spirv.ptr<!spirv.array<4xf32>, Function>
+    %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, Function>, i32, i32 -> !spirv.ptr<f32, Function>
     spirv.Return
   }
 }
@@ -41,13 +41,13 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     // CHECK: [[LOAD_PTR:%.*]] = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>
     // CHECK-NEXT: [[VAL:%.*]] = spirv.Load "StorageBuffer" [[LOAD_PTR]] : f32
     %0 = spirv.Constant 0 : i32
-    %1 = spirv.AccessChain %arg0[%0, %0] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %1 = spirv.AccessChain %arg0[%0, %0] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     %2 = spirv.Load "StorageBuffer" %1 : f32
 
     // CHECK: [[STORE_PTR:%.*]] = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>
     // CHECK-NEXT: spirv.Store "StorageBuffer" [[STORE_PTR]], [[VAL]] : f32
     %3 = spirv.Constant 0 : i32
-    %4 = spirv.AccessChain %arg1[%3, %3] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %4 = spirv.AccessChain %arg1[%3, %3] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x f32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Store "StorageBuffer" %4, %2 : f32
     spirv.Return
   }
@@ -56,13 +56,13 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     // CHECK: [[LOAD_PTR:%.*]] = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>
     // CHECK-NEXT: [[VAL:%.*]] = spirv.Load "StorageBuffer" [[LOAD_PTR]] : i32
     %0 = spirv.Constant 0 : i32
-    %1 = spirv.AccessChain %arg0[%0, %0] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %1 = spirv.AccessChain %arg0[%0, %0] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i32, StorageBuffer>
     %2 = spirv.Load "StorageBuffer" %1 : i32
 
     // CHECK: [[STORE_PTR:%.*]] = spirv.AccessChain {{%.*}}[{{%.*}}, {{%.*}}] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>
     // CHECK-NEXT: spirv.Store "StorageBuffer" [[STORE_PTR]], [[VAL]] : i32
     %3 = spirv.Constant 0 : i32
-    %4 = spirv.AccessChain %arg1[%3, %3] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>, StorageBuffer>, i32, i32
+    %4 = spirv.AccessChain %arg1[%3, %3] : !spirv.ptr<!spirv.struct<(!spirv.array<1 x i32, stride=4> [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr<i32, StorageBuffer>
     spirv.Store "StorageBuffer" %4, %2 : i32
     spirv.Return
   }
diff --git a/mlir/test/Target/SPIRV/physical-storage-buffer.mlir b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
index 040cfb891cb31..7cbd3f94e55ff 100644
--- a/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
+++ b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
@@ -26,17 +26,17 @@ spirv.module PhysicalStorageBuffer64 GLSL450 requires #spirv.vce<v1.5,
     %idx1 = spirv.Constant 1 : i64
     %idx2 = spirv.Constant 2 : i64
     %set_0_addr = spirv.mlir.addressof @set_0 : !set_0_ptr
-    %s0_b2_ptr = spirv.AccessChain %set_0_addr[%idx2] : !set_0_ptr, i64
+    %s0_b2_ptr = spirv.AccessChain %set_0_addr[%idx2] : !set_0_ptr, i64 -> !spirv.ptr<!f32_binding_ptr, StorageBuffer>
     %b2_ptr = spirv.Load "StorageBuffer" %s0_b2_ptr : !f32_binding_ptr
-    %b2_data_ptr = spirv.AccessChain %b2_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+    %b2_data_ptr = spirv.AccessChain %b2_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64 -> !spirv.ptr<f32, PhysicalStorageBuffer>
 
     // CHECK: spirv.Load "PhysicalStorageBuffer"
     %b2_data = spirv.Load "PhysicalStorageBuffer" %b2_data_ptr ["Aligned", 4] : f32
 
     %set_1_addr = spirv.mlir.addressof @set_1 : !set_1_ptr
-    %s1_b1_ptr = spirv.AccessChain %set_1_addr[%idx1] : !set_1_ptr, i64
+    %s1_b1_ptr = spirv.AccessChain %set_1_addr[%idx1] : !set_1_ptr, i64 -> !spirv.ptr<!f32_binding_ptr, StorageBuffer>
     %b1_ptr = spirv.Load "StorageBuffer" %s1_b1_ptr : !f32_binding_ptr
-    %b1_data_ptr = spirv.AccessChain %b1_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+    %b1_data_ptr = spirv.AccessChain %b1_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64 -> !spirv.ptr<f32, PhysicalStorageBuffer>
 
     // CHECK: spirv.Store "PhysicalStorageBuffer"
     spirv.Store "PhysicalStorageBuffer" %b1_data_ptr, %b2_data ["Aligned", 4] : f32
diff --git a/mlir/test/Target/SPIRV/undef.mlir b/mlir/test/Target/SPIRV/undef.mlir
index 217018184429c..b9044fe8b40af 100644
--- a/mlir/test/Target/SPIRV/undef.mlir
+++ b/mlir/test/Target/SPIRV/undef.mlir
@@ -16,7 +16,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     // CHECK: {{%.*}} = spirv.Undef : !spirv.ptr<!spirv.struct<(f32)>, StorageBuffer>
     %7 = spirv.Undef : !spirv.ptr<!spirv.struct<(f32)>, StorageBuffer>
     %8 = spirv.Constant 0 : i32
-    %9 = spirv.AccessChain %7[%8] : !spirv.ptr<!spirv.struct<(f32)>, StorageBuffer>, i32
+    %9 = spirv.AccessChain %7[%8] : !spirv.ptr<!spirv.struct<(f32)>, StorageBuffer>, i32 -> !spirv.ptr<f32, StorageBuffer>
     spirv.Return
   }
 }
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 5899f56da7345..c65e68eaf31f0 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -74,13 +74,13 @@ struct TestLinalgTransforms
       *this, "test-generalize-pad-tensor",
       llvm::cl::desc("Test transform pad tensor by copying with generic ops"),
       llvm::cl::init(false)};
-  Option<bool> testGeneralizeTensorPackOp{
-      *this, "test-generalize-tensor-pack",
+  Option<bool> testDecomposeTensorPackOp{
+      *this, "test-decompose-tensor-pack",
       llvm::cl::desc("Test transform that generalizes pack ops into a sequence "
                      "of tensor and Linalg ops"),
       llvm::cl::init(false)};
-  Option<bool> testGeneralizeTensorUnPackOp{
-      *this, "test-generalize-tensor-unpack",
+  Option<bool> testDecomposeTensorUnPackOp{
+      *this, "test-decompose-tensor-unpack",
       llvm::cl::desc(
           "Test transform that generalizes unpack ops into a sequence "
           "of tensor and Linalg ops"),
@@ -172,15 +172,15 @@ static void applyGeneralizePadTensorPatterns(func::FuncOp funcOp) {
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
-static void applyGeneralizeTensorPackPatterns(func::FuncOp funcOp) {
+static void applyDecomposeTensorPackPatterns(func::FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
-  patterns.add<GeneralizeOuterUnitDimsPackOpPattern>(funcOp.getContext());
+  patterns.add<DecomposeOuterUnitDimsPackOpPattern>(funcOp.getContext());
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
-static void applyGeneralizeTensorUnPackPatterns(func::FuncOp funcOp) {
+static void applyDecomposeTensorUnPackPatterns(func::FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
-  patterns.add<GeneralizeOuterUnitDimsUnPackOpPattern>(funcOp.getContext());
+  patterns.add<DecomposeOuterUnitDimsUnPackOpPattern>(funcOp.getContext());
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
@@ -237,10 +237,10 @@ void TestLinalgTransforms::runOnOperation() {
     return applyLinalgToVectorPatterns(getOperation());
   if (testGeneralizePadTensor)
     return applyGeneralizePadTensorPatterns(getOperation());
-  if (testGeneralizeTensorPackOp)
-    return applyGeneralizeTensorPackPatterns(getOperation());
-  if (testGeneralizeTensorUnPackOp)
-    return applyGeneralizeTensorUnPackPatterns(getOperation());
+  if (testDecomposeTensorPackOp)
+    return applyDecomposeTensorPackPatterns(getOperation());
+  if (testDecomposeTensorUnPackOp)
+    return applyDecomposeTensorUnPackPatterns(getOperation());
   if (testSwapSubTensorPadTensor)
     return applyExtractSliceOfPadTensorSwapPattern(getOperation());
   if (testBubbleUpExtractSliceOpPattern)
diff --git a/mlir/test/mlir-cpu-runner/simple.mlir b/mlir/test/mlir-cpu-runner/simple.mlir
index 38d9dcaf55371..f7f73d17d8282 100644
--- a/mlir/test/mlir-cpu-runner/simple.mlir
+++ b/mlir/test/mlir-cpu-runner/simple.mlir
@@ -1,15 +1,22 @@
-// RUN: mlir-cpu-runner %s | FileCheck %s
-// RUN: mlir-cpu-runner %s -e foo | FileCheck -check-prefix=NOMAIN %s
-// RUN: mlir-cpu-runner %s --entry-point-result=i32 -e int32_main | FileCheck -check-prefix=INT32MAIN %s
-// RUN: mlir-cpu-runner %s --entry-point-result=i64 -e int64_main | FileCheck -check-prefix=INT64MAIN %s
-// RUN: mlir-cpu-runner %s -O3 | FileCheck %s
+// RUN: mlir-cpu-runner %s %if target={{s390x-.*}} %{ -argext-abi-check=false %} \
+// RUN:   | FileCheck %s
+// RUN: mlir-cpu-runner %s -e foo %if target={{s390x-.*}} %{ -argext-abi-check=false %} \
+// RUN:   | FileCheck -check-prefix=NOMAIN %s
+// RUN: mlir-cpu-runner %s --entry-point-result=i32 -e int32_main %if target={{s390x-.*}} \
+// RUN:   %{ -argext-abi-check=false %} | FileCheck -check-prefix=INT32MAIN %s
+// RUN: mlir-cpu-runner %s --entry-point-result=i64 -e int64_main %if target={{s390x-.*}} \
+// RUN:   %{ -argext-abi-check=false %} | FileCheck -check-prefix=INT64MAIN %s
+// RUN: mlir-cpu-runner %s -O3 %if target={{s390x-.*}} %{ -argext-abi-check=false %} \
+// RUN:   | FileCheck %s
 
 // RUN: cp %s %t
-// RUN: mlir-cpu-runner %t -dump-object-file | FileCheck %t
+// RUN: mlir-cpu-runner %t -dump-object-file %if target={{s390x-.*}} \
+// RUN:   %{ -argext-abi-check=false %} | FileCheck %t
 // RUN: ls %t.o
 // RUN: rm %t.o
 
-// RUN: mlir-cpu-runner %s -dump-object-file -object-filename=%T/test.o | FileCheck %s
+// RUN: mlir-cpu-runner %s -dump-object-file -object-filename=%T/test.o \
+// RUN:   %if target={{s390x-.*}} %{ -argext-abi-check=false %} | FileCheck %s
 // RUN: ls %T/test.o
 // RUN: rm %T/test.o
 
diff --git a/mlir/test/python/dialects/affine.py b/mlir/test/python/dialects/affine.py
index 7faae6ccedc97..7ef128c1724c4 100644
--- a/mlir/test/python/dialects/affine.py
+++ b/mlir/test/python/dialects/affine.py
@@ -50,7 +50,7 @@ def testAffineDelinearizeInfer():
     # CHECK: %[[C1:.*]] = arith.constant 1 : index
     c1 = arith.ConstantOp(T.index(), 1)
     # CHECK: %{{.*}}:2 = affine.delinearize_index %[[C1:.*]] into (2, 3) : index, index
-    two_indices = affine.AffineDelinearizeIndexOp(c1, [], [2, 3])
+    two_indices = affine.AffineDelinearizeIndexOp([T.index()] * 2, c1, [], [2, 3])
 
 
 # CHECK-LABEL: TEST: testAffineLoadOp
@@ -157,7 +157,7 @@ def testAffineForOpErrors():
         )
 
     try:
-        two_indices = affine.AffineDelinearizeIndexOp(c1, [], [1, 1])
+        two_indices = affine.AffineDelinearizeIndexOp([T.index()] * 2, c1, [], [1, 1])
         affine.AffineForOp(
             two_indices,
             c2,
diff --git a/mlir/test/python/dialects/quant.py b/mlir/test/python/dialects/quant.py
index 0ee3327dec152..b1d6e85f519b5 100644
--- a/mlir/test/python/dialects/quant.py
+++ b/mlir/test/python/dialects/quant.py
@@ -108,9 +108,9 @@ def test_uniform_per_axis_type():
             ),
         )
 
-        # CHECK: scales: None
+        # CHECK: scales: [200.0, 0.99872]
         print(f"scales: {per_axis.scales}")
-        # CHECK: zero_points: None
+        # CHECK: zero_points: [0, 120]
         print(f"zero_points: {per_axis.zero_points}")
         # CHECK: quantized dim: 1
         print(f"quantized dim: {per_axis.quantized_dimension}")
diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py
index 48ddc8359ca0a..6ce0fc12d8082 100644
--- a/mlir/test/python/ir/builtin_types.py
+++ b/mlir/test/python/ir/builtin_types.py
@@ -639,6 +639,7 @@ def testTypeIDs():
             (BF16Type, BF16Type.get()),
             (F16Type, F16Type.get()),
             (F32Type, F32Type.get()),
+            (FloatTF32Type, FloatTF32Type.get()),
             (F64Type, F64Type.get()),
             (NoneType, NoneType.get()),
             (ComplexType, ComplexType.get(f32)),
@@ -668,6 +669,7 @@ def testTypeIDs():
         # CHECK: BF16Type(bf16)
         # CHECK: F16Type(f16)
         # CHECK: F32Type(f32)
+        # CHECK: FloatTF32Type(tf32)
         # CHECK: F64Type(f64)
         # CHECK: NoneType(none)
         # CHECK: ComplexType(complex<f32>)
@@ -734,6 +736,9 @@ def print_downcasted(typ):
         # CHECK: F32Type
         # CHECK: F32Type(f32)
         print_downcasted(F32Type.get())
+        # CHECK: FloatTF32Type
+        # CHECK: FloatTF32Type(tf32)
+        print_downcasted(FloatTF32Type.get())
         # CHECK: F64Type
         # CHECK: F64Type(f64)
         print_downcasted(F64Type.get())
diff --git a/mlir/test/python/ir/value.py b/mlir/test/python/ir/value.py
index 50b0e8403a7f2..9a8146bd9350b 100644
--- a/mlir/test/python/ir/value.py
+++ b/mlir/test/python/ir/value.py
@@ -148,6 +148,77 @@ def testValueReplaceAllUsesWith():
         print(f"Use operand_number: {use.operand_number}")
 
 
+# CHECK-LABEL: TEST: testValueReplaceAllUsesWithExcept
+@run
+def testValueReplaceAllUsesWithExcept():
+    ctx = Context()
+    ctx.allow_unregistered_dialects = True
+    with Location.unknown(ctx):
+        i32 = IntegerType.get_signless(32)
+        module = Module.create()
+        with InsertionPoint(module.body):
+            value = Operation.create("custom.op1", results=[i32]).results[0]
+            op1 = Operation.create("custom.op1", operands=[value])
+            op2 = Operation.create("custom.op2", operands=[value])
+            value2 = Operation.create("custom.op3", results=[i32]).results[0]
+            value.replace_all_uses_except(value2, op1)
+
+    assert len(list(value.uses)) == 1
+
+    # CHECK: Use owner: "custom.op2"
+    # CHECK: Use operand_number: 0
+    for use in value2.uses:
+        assert use.owner in [op2]
+        print(f"Use owner: {use.owner}")
+        print(f"Use operand_number: {use.operand_number}")
+
+    # CHECK: Use owner: "custom.op1"
+    # CHECK: Use operand_number: 0
+    for use in value.uses:
+        assert use.owner in [op1]
+        print(f"Use owner: {use.owner}")
+        print(f"Use operand_number: {use.operand_number}")
+
+
+# CHECK-LABEL: TEST: testValueReplaceAllUsesWithMultipleExceptions
+@run
+def testValueReplaceAllUsesWithMultipleExceptions():
+    ctx = Context()
+    ctx.allow_unregistered_dialects = True
+    with Location.unknown(ctx):
+        i32 = IntegerType.get_signless(32)
+        module = Module.create()
+        with InsertionPoint(module.body):
+            value = Operation.create("custom.op1", results=[i32]).results[0]
+            op1 = Operation.create("custom.op1", operands=[value])
+            op2 = Operation.create("custom.op2", operands=[value])
+            op3 = Operation.create("custom.op3", operands=[value])
+            value2 = Operation.create("custom.op4", results=[i32]).results[0]
+
+            # Replace all uses of `value` with `value2`, except for `op1` and `op2`.
+            value.replace_all_uses_except(value2, [op1, op2])
+
+    # After replacement, only `op3` should use `value2`, while `op1` and `op2` should still use `value`.
+    assert len(list(value.uses)) == 2
+    assert len(list(value2.uses)) == 1
+
+    # CHECK: Use owner: "custom.op3"
+    # CHECK: Use operand_number: 0
+    for use in value2.uses:
+        assert use.owner in [op3]
+        print(f"Use owner: {use.owner}")
+        print(f"Use operand_number: {use.operand_number}")
+
+    # CHECK: Use owner: "custom.op2"
+    # CHECK: Use operand_number: 0
+    # CHECK: Use owner: "custom.op1"
+    # CHECK: Use operand_number: 0
+    for use in value.uses:
+        assert use.owner in [op1, op2]
+        print(f"Use owner: {use.owner}")
+        print(f"Use operand_number: {use.operand_number}")
+
+
 # CHECK-LABEL: TEST: testValuePrintAsOperand
 @run
 def testValuePrintAsOperand():
diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp
index ff87fc9fad805..887db227cfc4b 100644
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@@ -61,12 +61,21 @@ static LogicalResult lowerToLLVMDialect(ModuleOp module) {
 }
 
 TEST(MLIRExecutionEngine, SKIP_WITHOUT_JIT(AddInteger)) {
+#ifdef __s390__
+  std::string moduleStr = R"mlir(
+  func.func @foo(%arg0 : i32 {llvm.signext}) -> (i32 {llvm.signext}) attributes { llvm.emit_c_interface } {
+    %res = arith.addi %arg0, %arg0 : i32
+    return %res : i32
+  }
+  )mlir";
+#else
   std::string moduleStr = R"mlir(
   func.func @foo(%arg0 : i32) -> i32 attributes { llvm.emit_c_interface } {
     %res = arith.addi %arg0, %arg0 : i32
     return %res : i32
   }
   )mlir";
+#endif
   DialectRegistry registry;
   registerAllDialects(registry);
   registerBuiltinDialectTranslation(registry);
@@ -259,6 +268,16 @@ TEST(NativeMemRefJit, MAYBE_JITCallback) {
   for (float &elt : *a)
     elt = count++;
 
+#ifdef __s390__
+  std::string moduleStr = R"mlir(
+  func.func private @callback(%arg0: memref<?x?xf32>, %coefficient: i32 {llvm.signext})  attributes { llvm.emit_c_interface }
+  func.func @caller_for_callback(%arg0: memref<?x?xf32>, %coefficient: i32 {llvm.signext}) attributes { llvm.emit_c_interface } {
+    %unranked = memref.cast %arg0: memref<?x?xf32> to memref<*xf32>
+    call @callback(%arg0, %coefficient) : (memref<?x?xf32>, i32) -> ()
+    return
+  }
+  )mlir";
+#else
   std::string moduleStr = R"mlir(
   func.func private @callback(%arg0: memref<?x?xf32>, %coefficient: i32)  attributes { llvm.emit_c_interface }
   func.func @caller_for_callback(%arg0: memref<?x?xf32>, %coefficient: i32) attributes { llvm.emit_c_interface } {
@@ -267,6 +286,8 @@ TEST(NativeMemRefJit, MAYBE_JITCallback) {
     return
   }
   )mlir";
+#endif
+
   DialectRegistry registry;
   registerAllDialects(registry);
   registerBuiltinDialectTranslation(registry);
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 642aa04517809..eee9bd5f23475 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -156,3 +156,62 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) {
     ASSERT_TRUE(!object->empty());
   }
 }
+
+// Test callback functions invoked with LLVM IR and ISA.
+TEST_F(MLIRTargetLLVMNVVM,
+       SKIP_WITHOUT_NVPTX(CallbackInvokedWithLLVMIRAndISA)) {
+  MLIRContext context(registry);
+
+  OwningOpRef<ModuleOp> module =
+      parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+
+  NVVM::NVVMTargetAttr target = NVVM::NVVMTargetAttr::get(&context);
+
+  auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
+  ASSERT_TRUE(!!serializer);
+
+  std::string initialLLVMIR;
+  auto initialCallback = [&initialLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(initialLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  std::string linkedLLVMIR;
+  auto linkedCallback = [&linkedLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(linkedLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  std::string optimizedLLVMIR;
+  auto optimizedCallback = [&optimizedLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(optimizedLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  std::string isaResult;
+  auto isaCallback = [&isaResult](llvm::StringRef isa) {
+    isaResult = isa.str();
+  };
+
+  gpu::TargetOptions options({}, {}, {}, gpu::CompilationTarget::Assembly, {},
+                             initialCallback, linkedCallback, optimizedCallback,
+                             isaCallback);
+
+  for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
+    std::optional<SmallVector<char, 0>> object =
+        serializer.serializeToObject(gpuModule, options);
+
+    ASSERT_TRUE(object != std::nullopt);
+    ASSERT_TRUE(!object->empty());
+    ASSERT_TRUE(!initialLLVMIR.empty());
+    ASSERT_TRUE(!linkedLLVMIR.empty());
+    ASSERT_TRUE(!optimizedLLVMIR.empty());
+    ASSERT_TRUE(!isaResult.empty());
+
+    initialLLVMIR.clear();
+    linkedLLVMIR.clear();
+    optimizedLLVMIR.clear();
+    isaResult.clear();
+  }
+}
diff --git a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
index 0d4277ed2fdfd..63d1dbd2519be 100644
--- a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp
@@ -105,7 +105,9 @@ TargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
   // Set a dummy attr to be retrieved by `createObject`.
   module->setAttr("serialize_attr", UnitAttr::get(module->getContext()));
   std::string targetTriple = llvm::sys::getProcessTriple();
-  LLVM::ModuleToObject serializer(*module, targetTriple, "", "");
+  LLVM::ModuleToObject serializer(
+      *module, targetTriple, "", "", 3, options.getInitialLlvmIRCallback(),
+      options.getLinkedLlvmIRCallback(), options.getOptimizedLlvmIRCallback());
   return serializer.run();
 }
 
@@ -153,3 +155,88 @@ TEST_F(MLIRTargetLLVM, SKIP_WITHOUT_NATIVE(TargetAttrAPI)) {
   // `serializeToObject`.
   ASSERT_TRUE(properties.contains("serialize_attr"));
 }
+
+// Test that the callback is invoked with the initial LLVM IR.
+TEST_F(MLIRTargetLLVM, SKIP_WITHOUT_NATIVE(CallbackInvokedWithInitialLLVMIR)) {
+  MLIRContext context(registry);
+
+  OwningOpRef<ModuleOp> module =
+      parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+  Builder builder(&context);
+  IntegerAttr target = builder.getI32IntegerAttr(0);
+  auto targetAttr = dyn_cast<gpu::TargetAttrInterface>(target);
+
+  std::string initialLLVMIR;
+  auto initialCallback = [&initialLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(initialLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  gpu::TargetOptions opts(
+      {}, {}, {}, mlir::gpu::TargetOptions::getDefaultCompilationTarget(), {},
+      initialCallback);
+  std::optional<SmallVector<char, 0>> serializedBinary =
+      targetAttr.serializeToObject(*module, opts);
+
+  ASSERT_TRUE(serializedBinary != std::nullopt);
+  ASSERT_TRUE(!serializedBinary->empty());
+  ASSERT_TRUE(!initialLLVMIR.empty());
+}
+
+// Test that the callback is invoked with the linked LLVM IR.
+TEST_F(MLIRTargetLLVM, SKIP_WITHOUT_NATIVE(CallbackInvokedWithLinkedLLVMIR)) {
+  MLIRContext context(registry);
+
+  OwningOpRef<ModuleOp> module =
+      parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+  Builder builder(&context);
+  IntegerAttr target = builder.getI32IntegerAttr(0);
+  auto targetAttr = dyn_cast<gpu::TargetAttrInterface>(target);
+
+  std::string linkedLLVMIR;
+  auto linkedCallback = [&linkedLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(linkedLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  gpu::TargetOptions opts(
+      {}, {}, {}, mlir::gpu::TargetOptions::getDefaultCompilationTarget(), {},
+      {}, linkedCallback);
+  std::optional<SmallVector<char, 0>> serializedBinary =
+      targetAttr.serializeToObject(*module, opts);
+
+  ASSERT_TRUE(serializedBinary != std::nullopt);
+  ASSERT_TRUE(!serializedBinary->empty());
+  ASSERT_TRUE(!linkedLLVMIR.empty());
+}
+
+// Test that the callback is invoked with the optimized LLVM IR.
+TEST_F(MLIRTargetLLVM,
+       SKIP_WITHOUT_NATIVE(CallbackInvokedWithOptimizedLLVMIR)) {
+  MLIRContext context(registry);
+
+  OwningOpRef<ModuleOp> module =
+      parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+  Builder builder(&context);
+  IntegerAttr target = builder.getI32IntegerAttr(0);
+  auto targetAttr = dyn_cast<gpu::TargetAttrInterface>(target);
+
+  std::string optimizedLLVMIR;
+  auto optimizedCallback = [&optimizedLLVMIR](llvm::Module &module) {
+    llvm::raw_string_ostream ros(optimizedLLVMIR);
+    module.print(ros, nullptr);
+  };
+
+  gpu::TargetOptions opts(
+      {}, {}, {}, mlir::gpu::TargetOptions::getDefaultCompilationTarget(), {},
+      {}, {}, optimizedCallback);
+  std::optional<SmallVector<char, 0>> serializedBinary =
+      targetAttr.serializeToObject(*module, opts);
+
+  ASSERT_TRUE(serializedBinary != std::nullopt);
+  ASSERT_TRUE(!serializedBinary->empty());
+  ASSERT_TRUE(!optimizedLLVMIR.empty());
+}
\ No newline at end of file
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 96cb79b7d071c..c76ad018ab4fe 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -43,7 +43,7 @@ set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)
 
 set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010"
                              "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035"
                              "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
                              "gfx1151;gfx1152;gfx1153")
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 423dae3957d41..affedb1a33687 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -222,7 +222,7 @@ JITEngine::backend(Module &M, const std::string &ComputeUnitKind,
     if (EC)
       return createStringError(
           EC, "Could not open %s to write the post-opt IR module\n",
-          PreOptIRModuleFileName.get().c_str());
+          PostOptIRModuleFileName.get().c_str());
     M.print(FD, nullptr);
   }
 
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 61c0bacc9f206..698e185d9c4dd 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -26,16 +26,22 @@ if(${LIBOMP_OMPT_SUPPORT})
 endif()
 
 # Generate message catalog files: kmp_i18n_id.inc and kmp_i18n_default.inc
+set(LIBOMP_MESSAGE_CONVERTER_EXTRA_ARGS "")
+if("${CMAKE_SYSTEM_NAME}" STREQUAL "Emscripten")
+  # Required because Python does not inherit CMake's environment setup and would otherwise use the host system as the target system.
+  set(LIBOMP_MESSAGE_CONVERTER_EXTRA_ARGS ${LIBOMP_MESSAGE_CONVERTER_EXTRA_ARGS} --target-system-override=${CMAKE_SYSTEM_NAME})
+endif()
+
 add_custom_command(
   OUTPUT  kmp_i18n_id.inc
   COMMAND ${Python3_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.py
-          --enum=kmp_i18n_id.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+          --enum=kmp_i18n_id.inc ${LIBOMP_MESSAGE_CONVERTER_EXTRA_ARGS} ${LIBOMP_SRC_DIR}/i18n/en_US.txt
   DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.py
 )
 add_custom_command(
   OUTPUT  kmp_i18n_default.inc
   COMMAND ${Python3_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.py
-          --default=kmp_i18n_default.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+          --default=kmp_i18n_default.inc ${LIBOMP_MESSAGE_CONVERTER_EXTRA_ARGS} ${LIBOMP_SRC_DIR}/i18n/en_US.txt
   DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.py
 )
 
diff --git a/openmp/runtime/tools/message-converter.py b/openmp/runtime/tools/message-converter.py
index b3e0b343c65a2..a493d64c1692d 100644
--- a/openmp/runtime/tools/message-converter.py
+++ b/openmp/runtime/tools/message-converter.py
@@ -19,6 +19,32 @@
 from libomputils import ScriptError, error
 
 
+class TargetPlatform:
+    """Convenience class for handling the target platform for configuration/compilation"""
+
+    system_override = None
+    """
+    Target system name override set by the user.
+    It follows the conventions from https://docs.python.org/3/library/platform.html#platform.system
+    """
+
+    def set_system_override(override_system):
+        """
+        Set a system override for the target.
+        Please follow the style from https://docs.python.org/3/library/platform.html#platform.system
+        """
+        TargetPlatform.system_override = override_system
+
+    def system():
+        """
+        Target system name.
+        It follows the conventions from https://docs.python.org/3/library/platform.html#platform.system
+        """
+        if TargetPlatform.system_override is None:
+            return platform.system()
+        return TargetPlatform.system_override
+
+
 class ParseMessageDataError(ScriptError):
     """Convenience class for parsing message data file errors"""
 
@@ -55,7 +81,7 @@ def __init__(self, lineNumber, name, text):
         self.text = text
 
     def toSrc(self):
-        if platform.system() == "Windows":
+        if TargetPlatform.system().casefold() == "Windows".casefold():
             return re.sub(r"%([0-9])\$(s|l?[du])", r"%\1!\2!", self.text)
         return str(self.text)
 
@@ -363,6 +389,13 @@ def main():
     parser.add_argument(
         "--message", metavar="FILE", help="Generate message file named FILE"
     )
+    parser.add_argument(
+        "--target-system-override",
+        metavar="TARGET_SYSTEM_NAME",
+        help="Target System override.\n"
+        "By default the target system is the host system\n"
+        "See possible values at https://docs.python.org/3/library/platform.html#platform.system",
+    )
     parser.add_argument("inputfile")
     commandArgs = parser.parse_args()
 
@@ -371,6 +404,8 @@ def main():
         return
     data = MessageData.create(commandArgs.inputfile)
     prefix = commandArgs.prefix
+    if commandArgs.target_system_override:
+        TargetPlatform.set_system_override(commandArgs.target_system_override)
     if commandArgs.enum:
         generate_enum_file(commandArgs.enum, prefix, data)
     if commandArgs.default:
@@ -378,7 +413,7 @@ def main():
     if commandArgs.signature:
         generate_signature_file(commandArgs.signature, data)
     if commandArgs.message:
-        if platform.system() == "Windows":
+        if TargetPlatform.system().casefold() == "Windows".casefold():
             generate_message_file_windows(commandArgs.message, data)
         else:
             generate_message_file_unix(commandArgs.message, data)
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 91e7e0113aa02..e2babada50051 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -357,7 +357,7 @@ gentbl(
 )
 
 gentbl(
-    name="basic_builtins_x86_gen",
+    name = "basic_builtins_x86_gen",
     tbl_outs = [(
         "-gen-clang-builtins",
         "include/clang/Basic/BuiltinsX86.inc",
@@ -1962,6 +1962,7 @@ cc_library(
         "//llvm:Support",
         "//llvm:Target",
         "//llvm:TargetParser",
+        "//llvm:TransformUtils",
         "//llvm:config",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 9dc25f95b8e3f..ddc97119d3016 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -59,7 +59,7 @@ def libc_function(
         srcs,
         weak = False,
         copts = None,
-        local_defines = None,
+        local_defines = [],
         **kwargs):
     """Add target for a libc function.
 
@@ -108,16 +108,20 @@ def libc_function(
         name = libc_internal_target(name),
         srcs = srcs,
         copts = copts,
+        local_defines = local_defines,
         **kwargs
     )
 
     # This second target is the llvm libc C function with either a default or hidden visibility.
     # All other functions are hidden.
-    func_attrs = ["__attribute__((visibility(\"default\")))"]
-    if weak:
-        func_attrs = func_attrs + ["__attribute__((weak))"]
-    local_defines = local_defines or ["LIBC_COPT_PUBLIC_PACKAGING"]
-    local_defines = local_defines + ["LLVM_LIBC_FUNCTION_ATTR='%s'" % " ".join(func_attrs)]
+    func_attrs = [
+        "LLVM_LIBC_FUNCTION_ATTR_" + name + "='LLVM_LIBC_EMPTY, [[gnu::weak]]'",
+    ] if weak else []
+
+    local_defines = (local_defines
+                    + ["LIBC_COPT_PUBLIC_PACKAGING"]
+                    + ["LLVM_LIBC_FUNCTION_ATTR='[[gnu::visibility(\"default\")]]'"]
+                    + func_attrs)
     _libc_library(
         name = name,
         hidden = True,
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f0469eb987bb2..c76a65e6507fa 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5621,6 +5621,7 @@ cc_library(
         ":SCFDialect",
         ":SideEffectInterfaces",
         ":Support",
+        "//llvm:Core",
         "//llvm:Support",
     ],
 )